Actual source code: baij2.c
petsc-3.7.4 2016-10-02
2: #include <../src/mat/impls/baij/seq/baij.h>
3: #include <petsc/private/kernels/blockinvert.h>
4: #include <petscbt.h>
5: #include <petscblaslapack.h>
9: PetscErrorCode MatIncreaseOverlap_SeqBAIJ(Mat A,PetscInt is_max,IS is[],PetscInt ov)
10: {
11: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
13: PetscInt row,i,j,k,l,m,n,*nidx,isz,val,ival;
14: const PetscInt *idx;
15: PetscInt start,end,*ai,*aj,bs,*nidx2;
16: PetscBT table;
19: m = a->mbs;
20: ai = a->i;
21: aj = a->j;
22: bs = A->rmap->bs;
24: if (ov < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Negative overlap specified");
26: PetscBTCreate(m,&table);
27: PetscMalloc1(m+1,&nidx);
28: PetscMalloc1(A->rmap->N+1,&nidx2);
30: for (i=0; i<is_max; i++) {
31: /* Initialise the two local arrays */
32: isz = 0;
33: PetscBTMemzero(m,table);
35: /* Extract the indices, assume there can be duplicate entries */
36: ISGetIndices(is[i],&idx);
37: ISGetLocalSize(is[i],&n);
39: /* Enter these into the temp arrays i.e mark table[row], enter row into new index */
40: for (j=0; j<n; ++j) {
41: ival = idx[j]/bs; /* convert the indices into block indices */
42: if (ival>=m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"index greater than mat-dim");
43: if (!PetscBTLookupSet(table,ival)) nidx[isz++] = ival;
44: }
45: ISRestoreIndices(is[i],&idx);
46: ISDestroy(&is[i]);
48: k = 0;
49: for (j=0; j<ov; j++) { /* for each overlap*/
50: n = isz;
51: for (; k<n; k++) { /* do only those rows in nidx[k], which are not done yet */
52: row = nidx[k];
53: start = ai[row];
54: end = ai[row+1];
55: for (l = start; l<end; l++) {
56: val = aj[l];
57: if (!PetscBTLookupSet(table,val)) nidx[isz++] = val;
58: }
59: }
60: }
61: /* expand the Index Set */
62: for (j=0; j<isz; j++) {
63: for (k=0; k<bs; k++) nidx2[j*bs+k] = nidx[j]*bs+k;
64: }
65: ISCreateGeneral(PETSC_COMM_SELF,isz*bs,nidx2,PETSC_COPY_VALUES,is+i);
66: }
67: PetscBTDestroy(&table);
68: PetscFree(nidx);
69: PetscFree(nidx2);
70: return(0);
71: }
75: PetscErrorCode MatGetSubMatrix_SeqBAIJ_Private(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
76: {
77: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*c;
79: PetscInt *smap,i,k,kstart,kend,oldcols = a->nbs,*lens;
80: PetscInt row,mat_i,*mat_j,tcol,*mat_ilen;
81: const PetscInt *irow,*icol;
82: PetscInt nrows,ncols,*ssmap,bs=A->rmap->bs,bs2=a->bs2;
83: PetscInt *aj = a->j,*ai = a->i;
84: MatScalar *mat_a;
85: Mat C;
86: PetscBool flag;
89: ISGetIndices(isrow,&irow);
90: ISGetIndices(iscol,&icol);
91: ISGetLocalSize(isrow,&nrows);
92: ISGetLocalSize(iscol,&ncols);
94: PetscCalloc1(1+oldcols,&smap);
95: ssmap = smap;
96: PetscMalloc1(1+nrows,&lens);
97: for (i=0; i<ncols; i++) smap[icol[i]] = i+1;
98: /* determine lens of each row */
99: for (i=0; i<nrows; i++) {
100: kstart = ai[irow[i]];
101: kend = kstart + a->ilen[irow[i]];
102: lens[i] = 0;
103: for (k=kstart; k<kend; k++) {
104: if (ssmap[aj[k]]) lens[i]++;
105: }
106: }
107: /* Create and fill new matrix */
108: if (scall == MAT_REUSE_MATRIX) {
109: c = (Mat_SeqBAIJ*)((*B)->data);
111: if (c->mbs!=nrows || c->nbs!=ncols || (*B)->rmap->bs!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Submatrix wrong size");
112: PetscMemcmp(c->ilen,lens,c->mbs *sizeof(PetscInt),&flag);
113: if (!flag) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Cannot reuse matrix. wrong no of nonzeros");
114: PetscMemzero(c->ilen,c->mbs*sizeof(PetscInt));
115: C = *B;
116: } else {
117: MatCreate(PetscObjectComm((PetscObject)A),&C);
118: MatSetSizes(C,nrows*bs,ncols*bs,PETSC_DETERMINE,PETSC_DETERMINE);
119: MatSetType(C,((PetscObject)A)->type_name);
120: MatSeqBAIJSetPreallocation_SeqBAIJ(C,bs,0,lens);
121: }
122: c = (Mat_SeqBAIJ*)(C->data);
123: for (i=0; i<nrows; i++) {
124: row = irow[i];
125: kstart = ai[row];
126: kend = kstart + a->ilen[row];
127: mat_i = c->i[i];
128: mat_j = c->j + mat_i;
129: mat_a = c->a + mat_i*bs2;
130: mat_ilen = c->ilen + i;
131: for (k=kstart; k<kend; k++) {
132: if ((tcol=ssmap[a->j[k]])) {
133: *mat_j++ = tcol - 1;
134: PetscMemcpy(mat_a,a->a+k*bs2,bs2*sizeof(MatScalar));
135: mat_a += bs2;
136: (*mat_ilen)++;
137: }
138: }
139: }
140: /* sort */
141: {
142: MatScalar *work;
143: PetscMalloc1(bs2,&work);
144: for (i=0; i<nrows; i++) {
145: PetscInt ilen;
146: mat_i = c->i[i];
147: mat_j = c->j + mat_i;
148: mat_a = c->a + mat_i*bs2;
149: ilen = c->ilen[i];
150: PetscSortIntWithDataArray(ilen,mat_j,mat_a,bs2*sizeof(MatScalar),work);
151: }
152: PetscFree(work);
153: }
155: /* Free work space */
156: ISRestoreIndices(iscol,&icol);
157: PetscFree(smap);
158: PetscFree(lens);
159: MatAssemblyBegin(C,MAT_FINAL_ASSEMBLY);
160: MatAssemblyEnd(C,MAT_FINAL_ASSEMBLY);
162: ISRestoreIndices(isrow,&irow);
163: *B = C;
164: return(0);
165: }
169: PetscErrorCode MatGetSubMatrix_SeqBAIJ(Mat A,IS isrow,IS iscol,MatReuse scall,Mat *B)
170: {
171: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
172: IS is1,is2;
174: PetscInt *vary,*iary,nrows,ncols,i,bs=A->rmap->bs,count,maxmnbs,j;
175: const PetscInt *irow,*icol;
178: ISGetIndices(isrow,&irow);
179: ISGetIndices(iscol,&icol);
180: ISGetLocalSize(isrow,&nrows);
181: ISGetLocalSize(iscol,&ncols);
183: /* Verify if the indices corespond to each element in a block
184: and form the IS with compressed IS */
185: maxmnbs = PetscMax(a->mbs,a->nbs);
186: PetscMalloc2(maxmnbs,&vary,maxmnbs,&iary);
187: PetscMemzero(vary,a->mbs*sizeof(PetscInt));
188: for (i=0; i<nrows; i++) vary[irow[i]/bs]++;
189: for (i=0; i<a->mbs; i++) {
190: if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Index set does not match blocks");
191: }
192: count = 0;
193: for (i=0; i<nrows; i++) {
194: j = irow[i] / bs;
195: if ((vary[j]--)==bs) iary[count++] = j;
196: }
197: ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is1);
199: PetscMemzero(vary,(a->nbs)*sizeof(PetscInt));
200: for (i=0; i<ncols; i++) vary[icol[i]/bs]++;
201: for (i=0; i<a->nbs; i++) {
202: if (vary[i]!=0 && vary[i]!=bs) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_PLIB,"Internal error in PETSc");
203: }
204: count = 0;
205: for (i=0; i<ncols; i++) {
206: j = icol[i] / bs;
207: if ((vary[j]--)==bs) iary[count++] = j;
208: }
209: ISCreateGeneral(PETSC_COMM_SELF,count,iary,PETSC_COPY_VALUES,&is2);
210: ISRestoreIndices(isrow,&irow);
211: ISRestoreIndices(iscol,&icol);
212: PetscFree2(vary,iary);
214: MatGetSubMatrix_SeqBAIJ_Private(A,is1,is2,scall,B);
215: ISDestroy(&is1);
216: ISDestroy(&is2);
217: return(0);
218: }
222: PetscErrorCode MatGetSubMatrices_SeqBAIJ(Mat A,PetscInt n,const IS irow[],const IS icol[],MatReuse scall,Mat *B[])
223: {
225: PetscInt i;
228: if (scall == MAT_INITIAL_MATRIX) {
229: PetscMalloc1(n+1,B);
230: }
232: for (i=0; i<n; i++) {
233: MatGetSubMatrix_SeqBAIJ(A,irow[i],icol[i],scall,&(*B)[i]);
234: }
235: return(0);
236: }
239: /* -------------------------------------------------------*/
240: /* Should check that shapes of vectors and matrices match */
241: /* -------------------------------------------------------*/
245: PetscErrorCode MatMult_SeqBAIJ_1(Mat A,Vec xx,Vec zz)
246: {
247: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
248: PetscScalar *z,sum;
249: const PetscScalar *x;
250: const MatScalar *v;
251: PetscErrorCode ierr;
252: PetscInt mbs,i,n;
253: const PetscInt *idx,*ii,*ridx=NULL;
254: PetscBool usecprow=a->compressedrow.use;
257: VecGetArrayRead(xx,&x);
258: VecGetArray(zz,&z);
260: if (usecprow) {
261: mbs = a->compressedrow.nrows;
262: ii = a->compressedrow.i;
263: ridx = a->compressedrow.rindex;
264: PetscMemzero(z,a->mbs*sizeof(PetscScalar));
265: } else {
266: mbs = a->mbs;
267: ii = a->i;
268: }
270: for (i=0; i<mbs; i++) {
271: n = ii[1] - ii[0];
272: v = a->a + ii[0];
273: idx = a->j + ii[0];
274: ii++;
275: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
276: PetscPrefetchBlock(v+1*n,1*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
277: sum = 0.0;
278: PetscSparseDensePlusDot(sum,x,v,idx,n);
279: if (usecprow) {
280: z[ridx[i]] = sum;
281: } else {
282: z[i] = sum;
283: }
284: }
285: VecRestoreArrayRead(xx,&x);
286: VecRestoreArray(zz,&z);
287: PetscLogFlops(2.0*a->nz - a->nonzerorowcnt);
288: return(0);
289: }
293: PetscErrorCode MatMult_SeqBAIJ_2(Mat A,Vec xx,Vec zz)
294: {
295: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
296: PetscScalar *z = 0,sum1,sum2,*zarray;
297: const PetscScalar *x,*xb;
298: PetscScalar x1,x2;
299: const MatScalar *v;
300: PetscErrorCode ierr;
301: PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL;
302: PetscBool usecprow=a->compressedrow.use;
305: VecGetArrayRead(xx,&x);
306: VecGetArray(zz,&zarray);
308: idx = a->j;
309: v = a->a;
310: if (usecprow) {
311: mbs = a->compressedrow.nrows;
312: ii = a->compressedrow.i;
313: ridx = a->compressedrow.rindex;
314: PetscMemzero(zarray,2*a->mbs*sizeof(PetscScalar));
315: } else {
316: mbs = a->mbs;
317: ii = a->i;
318: z = zarray;
319: }
321: for (i=0; i<mbs; i++) {
322: n = ii[1] - ii[0]; ii++;
323: sum1 = 0.0; sum2 = 0.0;
324: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
325: PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
326: for (j=0; j<n; j++) {
327: xb = x + 2*(*idx++); x1 = xb[0]; x2 = xb[1];
328: sum1 += v[0]*x1 + v[2]*x2;
329: sum2 += v[1]*x1 + v[3]*x2;
330: v += 4;
331: }
332: if (usecprow) z = zarray + 2*ridx[i];
333: z[0] = sum1; z[1] = sum2;
334: if (!usecprow) z += 2;
335: }
336: VecRestoreArrayRead(xx,&x);
337: VecRestoreArray(zz,&zarray);
338: PetscLogFlops(8.0*a->nz - 2.0*a->nonzerorowcnt);
339: return(0);
340: }
344: PetscErrorCode MatMult_SeqBAIJ_3(Mat A,Vec xx,Vec zz)
345: {
346: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
347: PetscScalar *z = 0,sum1,sum2,sum3,x1,x2,x3,*zarray;
348: const PetscScalar *x,*xb;
349: const MatScalar *v;
350: PetscErrorCode ierr;
351: PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL;
352: PetscBool usecprow=a->compressedrow.use;
354: #if defined(PETSC_HAVE_PRAGMA_DISJOINT)
355: #pragma disjoint(*v,*z,*xb)
356: #endif
359: VecGetArrayRead(xx,&x);
360: VecGetArray(zz,&zarray);
362: idx = a->j;
363: v = a->a;
364: if (usecprow) {
365: mbs = a->compressedrow.nrows;
366: ii = a->compressedrow.i;
367: ridx = a->compressedrow.rindex;
368: PetscMemzero(zarray,3*a->mbs*sizeof(PetscScalar));
369: } else {
370: mbs = a->mbs;
371: ii = a->i;
372: z = zarray;
373: }
375: for (i=0; i<mbs; i++) {
376: n = ii[1] - ii[0]; ii++;
377: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0;
378: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
379: PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
380: for (j=0; j<n; j++) {
381: xb = x + 3*(*idx++);
382: x1 = xb[0];
383: x2 = xb[1];
384: x3 = xb[2];
386: sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
387: sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
388: sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
389: v += 9;
390: }
391: if (usecprow) z = zarray + 3*ridx[i];
392: z[0] = sum1; z[1] = sum2; z[2] = sum3;
393: if (!usecprow) z += 3;
394: }
395: VecRestoreArrayRead(xx,&x);
396: VecRestoreArray(zz,&zarray);
397: PetscLogFlops(18.0*a->nz - 3.0*a->nonzerorowcnt);
398: return(0);
399: }
403: PetscErrorCode MatMult_SeqBAIJ_4(Mat A,Vec xx,Vec zz)
404: {
405: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
406: PetscScalar *z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*zarray;
407: const PetscScalar *x,*xb;
408: const MatScalar *v;
409: PetscErrorCode ierr;
410: PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL;
411: PetscBool usecprow=a->compressedrow.use;
414: VecGetArrayRead(xx,&x);
415: VecGetArray(zz,&zarray);
417: idx = a->j;
418: v = a->a;
419: if (usecprow) {
420: mbs = a->compressedrow.nrows;
421: ii = a->compressedrow.i;
422: ridx = a->compressedrow.rindex;
423: PetscMemzero(zarray,4*a->mbs*sizeof(PetscScalar));
424: } else {
425: mbs = a->mbs;
426: ii = a->i;
427: z = zarray;
428: }
430: for (i=0; i<mbs; i++) {
431: n = ii[1] - ii[0];
432: ii++;
433: sum1 = 0.0;
434: sum2 = 0.0;
435: sum3 = 0.0;
436: sum4 = 0.0;
438: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
439: PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
440: for (j=0; j<n; j++) {
441: xb = x + 4*(*idx++);
442: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
443: sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
444: sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
445: sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
446: sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
447: v += 16;
448: }
449: if (usecprow) z = zarray + 4*ridx[i];
450: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
451: if (!usecprow) z += 4;
452: }
453: VecRestoreArrayRead(xx,&x);
454: VecRestoreArray(zz,&zarray);
455: PetscLogFlops(32.0*a->nz - 4.0*a->nonzerorowcnt);
456: return(0);
457: }
461: PetscErrorCode MatMult_SeqBAIJ_5(Mat A,Vec xx,Vec zz)
462: {
463: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
464: PetscScalar sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5,*z = 0,*zarray;
465: const PetscScalar *xb,*x;
466: const MatScalar *v;
467: PetscErrorCode ierr;
468: const PetscInt *idx,*ii,*ridx=NULL;
469: PetscInt mbs,i,j,n;
470: PetscBool usecprow=a->compressedrow.use;
473: VecGetArrayRead(xx,&x);
474: VecGetArray(zz,&zarray);
476: idx = a->j;
477: v = a->a;
478: if (usecprow) {
479: mbs = a->compressedrow.nrows;
480: ii = a->compressedrow.i;
481: ridx = a->compressedrow.rindex;
482: PetscMemzero(zarray,5*a->mbs*sizeof(PetscScalar));
483: } else {
484: mbs = a->mbs;
485: ii = a->i;
486: z = zarray;
487: }
489: for (i=0; i<mbs; i++) {
490: n = ii[1] - ii[0]; ii++;
491: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0;
492: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
493: PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
494: for (j=0; j<n; j++) {
495: xb = x + 5*(*idx++);
496: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
497: sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
498: sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
499: sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
500: sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
501: sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
502: v += 25;
503: }
504: if (usecprow) z = zarray + 5*ridx[i];
505: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
506: if (!usecprow) z += 5;
507: }
508: VecRestoreArrayRead(xx,&x);
509: VecRestoreArray(zz,&zarray);
510: PetscLogFlops(50.0*a->nz - 5.0*a->nonzerorowcnt);
511: return(0);
512: }
518: PetscErrorCode MatMult_SeqBAIJ_6(Mat A,Vec xx,Vec zz)
519: {
520: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
521: PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6;
522: const PetscScalar *x,*xb;
523: PetscScalar x1,x2,x3,x4,x5,x6,*zarray;
524: const MatScalar *v;
525: PetscErrorCode ierr;
526: PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL;
527: PetscBool usecprow=a->compressedrow.use;
530: VecGetArrayRead(xx,&x);
531: VecGetArray(zz,&zarray);
533: idx = a->j;
534: v = a->a;
535: if (usecprow) {
536: mbs = a->compressedrow.nrows;
537: ii = a->compressedrow.i;
538: ridx = a->compressedrow.rindex;
539: PetscMemzero(zarray,6*a->mbs*sizeof(PetscScalar));
540: } else {
541: mbs = a->mbs;
542: ii = a->i;
543: z = zarray;
544: }
546: for (i=0; i<mbs; i++) {
547: n = ii[1] - ii[0];
548: ii++;
549: sum1 = 0.0;
550: sum2 = 0.0;
551: sum3 = 0.0;
552: sum4 = 0.0;
553: sum5 = 0.0;
554: sum6 = 0.0;
556: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
557: PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
558: for (j=0; j<n; j++) {
559: xb = x + 6*(*idx++);
560: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
561: sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
562: sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
563: sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
564: sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
565: sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
566: sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
567: v += 36;
568: }
569: if (usecprow) z = zarray + 6*ridx[i];
570: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
571: if (!usecprow) z += 6;
572: }
574: VecRestoreArrayRead(xx,&x);
575: VecRestoreArray(zz,&zarray);
576: PetscLogFlops(72.0*a->nz - 6.0*a->nonzerorowcnt);
577: return(0);
578: }
582: PetscErrorCode MatMult_SeqBAIJ_7(Mat A,Vec xx,Vec zz)
583: {
584: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
585: PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
586: const PetscScalar *x,*xb;
587: PetscScalar x1,x2,x3,x4,x5,x6,x7,*zarray;
588: const MatScalar *v;
589: PetscErrorCode ierr;
590: PetscInt mbs,i,*idx,*ii,j,n,*ridx=NULL;
591: PetscBool usecprow=a->compressedrow.use;
594: VecGetArrayRead(xx,&x);
595: VecGetArray(zz,&zarray);
597: idx = a->j;
598: v = a->a;
599: if (usecprow) {
600: mbs = a->compressedrow.nrows;
601: ii = a->compressedrow.i;
602: ridx = a->compressedrow.rindex;
603: PetscMemzero(zarray,7*a->mbs*sizeof(PetscScalar));
604: } else {
605: mbs = a->mbs;
606: ii = a->i;
607: z = zarray;
608: }
610: for (i=0; i<mbs; i++) {
611: n = ii[1] - ii[0];
612: ii++;
613: sum1 = 0.0;
614: sum2 = 0.0;
615: sum3 = 0.0;
616: sum4 = 0.0;
617: sum5 = 0.0;
618: sum6 = 0.0;
619: sum7 = 0.0;
621: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
622: PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
623: for (j=0; j<n; j++) {
624: xb = x + 7*(*idx++);
625: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
626: sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
627: sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
628: sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
629: sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
630: sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
631: sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
632: sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
633: v += 49;
634: }
635: if (usecprow) z = zarray + 7*ridx[i];
636: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
637: if (!usecprow) z += 7;
638: }
640: VecRestoreArrayRead(xx,&x);
641: VecRestoreArray(zz,&zarray);
642: PetscLogFlops(98.0*a->nz - 7.0*a->nonzerorowcnt);
643: return(0);
644: }
646: /* MatMult_SeqBAIJ_15 version 1: Columns in the block are accessed one at a time */
647: /* Default MatMult for block size 15 */
651: PetscErrorCode MatMult_SeqBAIJ_15_ver1(Mat A,Vec xx,Vec zz)
652: {
653: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
654: PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
655: const PetscScalar *x,*xb;
656: PetscScalar *zarray,xv;
657: const MatScalar *v;
658: PetscErrorCode ierr;
659: const PetscInt *ii,*ij=a->j,*idx;
660: PetscInt mbs,i,j,k,n,*ridx=NULL;
661: PetscBool usecprow=a->compressedrow.use;
664: VecGetArrayRead(xx,&x);
665: VecGetArray(zz,&zarray);
667: v = a->a;
668: if (usecprow) {
669: mbs = a->compressedrow.nrows;
670: ii = a->compressedrow.i;
671: ridx = a->compressedrow.rindex;
672: PetscMemzero(zarray,15*a->mbs*sizeof(PetscScalar));
673: } else {
674: mbs = a->mbs;
675: ii = a->i;
676: z = zarray;
677: }
679: for (i=0; i<mbs; i++) {
680: n = ii[i+1] - ii[i];
681: idx = ij + ii[i];
682: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
683: sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
685: for (j=0; j<n; j++) {
686: xb = x + 15*(idx[j]);
688: for (k=0; k<15; k++) {
689: xv = xb[k];
690: sum1 += v[0]*xv;
691: sum2 += v[1]*xv;
692: sum3 += v[2]*xv;
693: sum4 += v[3]*xv;
694: sum5 += v[4]*xv;
695: sum6 += v[5]*xv;
696: sum7 += v[6]*xv;
697: sum8 += v[7]*xv;
698: sum9 += v[8]*xv;
699: sum10 += v[9]*xv;
700: sum11 += v[10]*xv;
701: sum12 += v[11]*xv;
702: sum13 += v[12]*xv;
703: sum14 += v[13]*xv;
704: sum15 += v[14]*xv;
705: v += 15;
706: }
707: }
708: if (usecprow) z = zarray + 15*ridx[i];
709: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
710: z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
712: if (!usecprow) z += 15;
713: }
715: VecRestoreArrayRead(xx,&x);
716: VecRestoreArray(zz,&zarray);
717: PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);
718: return(0);
719: }
721: /* MatMult_SeqBAIJ_15_ver2 : Columns in the block are accessed in sets of 4,4,4,3 */
724: PetscErrorCode MatMult_SeqBAIJ_15_ver2(Mat A,Vec xx,Vec zz)
725: {
726: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
727: PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
728: const PetscScalar *x,*xb;
729: PetscScalar x1,x2,x3,x4,*zarray;
730: const MatScalar *v;
731: PetscErrorCode ierr;
732: const PetscInt *ii,*ij=a->j,*idx;
733: PetscInt mbs,i,j,n,*ridx=NULL;
734: PetscBool usecprow=a->compressedrow.use;
737: VecGetArrayRead(xx,&x);
738: VecGetArray(zz,&zarray);
740: v = a->a;
741: if (usecprow) {
742: mbs = a->compressedrow.nrows;
743: ii = a->compressedrow.i;
744: ridx = a->compressedrow.rindex;
745: PetscMemzero(zarray,15*a->mbs*sizeof(PetscScalar));
746: } else {
747: mbs = a->mbs;
748: ii = a->i;
749: z = zarray;
750: }
752: for (i=0; i<mbs; i++) {
753: n = ii[i+1] - ii[i];
754: idx = ij + ii[i];
755: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
756: sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
758: for (j=0; j<n; j++) {
759: xb = x + 15*(idx[j]);
760: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
762: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4;
763: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4;
764: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4;
765: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4;
766: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4;
767: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4;
768: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4;
769: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4;
770: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4;
771: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4;
772: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4;
773: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4;
774: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4;
775: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4;
776: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4;
778: v += 60;
780: x1 = xb[4]; x2 = xb[5]; x3 = xb[6]; x4 = xb[7];
782: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4;
783: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4;
784: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4;
785: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4;
786: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4;
787: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4;
788: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4;
789: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4;
790: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4;
791: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4;
792: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4;
793: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4;
794: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4;
795: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4;
796: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4;
797: v += 60;
799: x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11];
800: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4;
801: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4;
802: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4;
803: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4;
804: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4;
805: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4;
806: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4;
807: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4;
808: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4;
809: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4;
810: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4;
811: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4;
812: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4;
813: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4;
814: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4;
815: v += 60;
817: x1 = xb[12]; x2 = xb[13]; x3 = xb[14];
818: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3;
819: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3;
820: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3;
821: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3;
822: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3;
823: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3;
824: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3;
825: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3;
826: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3;
827: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3;
828: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3;
829: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3;
830: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3;
831: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3;
832: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3;
833: v += 45;
834: }
835: if (usecprow) z = zarray + 15*ridx[i];
836: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
837: z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
839: if (!usecprow) z += 15;
840: }
842: VecRestoreArrayRead(xx,&x);
843: VecRestoreArray(zz,&zarray);
844: PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);
845: return(0);
846: }
848: /* MatMult_SeqBAIJ_15_ver3 : Columns in the block are accessed in sets of 8,7 */
851: PetscErrorCode MatMult_SeqBAIJ_15_ver3(Mat A,Vec xx,Vec zz)
852: {
853: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
854: PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
855: const PetscScalar *x,*xb;
856: PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,*zarray;
857: const MatScalar *v;
858: PetscErrorCode ierr;
859: const PetscInt *ii,*ij=a->j,*idx;
860: PetscInt mbs,i,j,n,*ridx=NULL;
861: PetscBool usecprow=a->compressedrow.use;
864: VecGetArrayRead(xx,&x);
865: VecGetArray(zz,&zarray);
867: v = a->a;
868: if (usecprow) {
869: mbs = a->compressedrow.nrows;
870: ii = a->compressedrow.i;
871: ridx = a->compressedrow.rindex;
872: PetscMemzero(zarray,15*a->mbs*sizeof(PetscScalar));
873: } else {
874: mbs = a->mbs;
875: ii = a->i;
876: z = zarray;
877: }
879: for (i=0; i<mbs; i++) {
880: n = ii[i+1] - ii[i];
881: idx = ij + ii[i];
882: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
883: sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
885: for (j=0; j<n; j++) {
886: xb = x + 15*(idx[j]);
887: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
888: x8 = xb[7];
890: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8;
891: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8;
892: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8;
893: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8;
894: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8;
895: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8;
896: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8;
897: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8;
898: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8;
899: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8;
900: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8;
901: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8;
902: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8;
903: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8;
904: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8;
905: v += 120;
907: x1 = xb[8]; x2 = xb[9]; x3 = xb[10]; x4 = xb[11]; x5 = xb[12]; x6 = xb[13]; x7 = xb[14];
909: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7;
910: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7;
911: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7;
912: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7;
913: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7;
914: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7;
915: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7;
916: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7;
917: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7;
918: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7;
919: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7;
920: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7;
921: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7;
922: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7;
923: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7;
924: v += 105;
925: }
926: if (usecprow) z = zarray + 15*ridx[i];
927: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
928: z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
930: if (!usecprow) z += 15;
931: }
933: VecRestoreArrayRead(xx,&x);
934: VecRestoreArray(zz,&zarray);
935: PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);
936: return(0);
937: }
939: /* MatMult_SeqBAIJ_15_ver4 : All columns in the block are accessed at once */
943: PetscErrorCode MatMult_SeqBAIJ_15_ver4(Mat A,Vec xx,Vec zz)
944: {
945: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
946: PetscScalar *z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7,sum8,sum9,sum10,sum11,sum12,sum13,sum14,sum15;
947: const PetscScalar *x,*xb;
948: PetscScalar x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13,x14,x15,*zarray;
949: const MatScalar *v;
950: PetscErrorCode ierr;
951: const PetscInt *ii,*ij=a->j,*idx;
952: PetscInt mbs,i,j,n,*ridx=NULL;
953: PetscBool usecprow=a->compressedrow.use;
956: VecGetArrayRead(xx,&x);
957: VecGetArray(zz,&zarray);
959: v = a->a;
960: if (usecprow) {
961: mbs = a->compressedrow.nrows;
962: ii = a->compressedrow.i;
963: ridx = a->compressedrow.rindex;
964: PetscMemzero(zarray,15*a->mbs*sizeof(PetscScalar));
965: } else {
966: mbs = a->mbs;
967: ii = a->i;
968: z = zarray;
969: }
971: for (i=0; i<mbs; i++) {
972: n = ii[i+1] - ii[i];
973: idx = ij + ii[i];
974: sum1 = 0.0; sum2 = 0.0; sum3 = 0.0; sum4 = 0.0; sum5 = 0.0; sum6 = 0.0; sum7 = 0.0;
975: sum8 = 0.0; sum9 = 0.0; sum10 = 0.0; sum11 = 0.0; sum12 = 0.0; sum13 = 0.0; sum14 = 0.0;sum15 = 0.0;
977: for (j=0; j<n; j++) {
978: xb = x + 15*(idx[j]);
979: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
980: x8 = xb[7]; x9 = xb[8]; x10 = xb[9]; x11 = xb[10]; x12 = xb[11]; x13 = xb[12]; x14 = xb[13];x15 = xb[14];
982: sum1 += v[0]*x1 + v[15]*x2 + v[30]*x3 + v[45]*x4 + v[60]*x5 + v[75]*x6 + v[90]*x7 + v[105]*x8 + v[120]*x9 + v[135]*x10 + v[150]*x11 + v[165]*x12 + v[180]*x13 + v[195]*x14 + v[210]*x15;
983: sum2 += v[1]*x1 + v[16]*x2 + v[31]*x3 + v[46]*x4 + v[61]*x5 + v[76]*x6 + v[91]*x7 + v[106]*x8 + v[121]*x9 + v[136]*x10 + v[151]*x11 + v[166]*x12 + v[181]*x13 + v[196]*x14 + v[211]*x15;
984: sum3 += v[2]*x1 + v[17]*x2 + v[32]*x3 + v[47]*x4 + v[62]*x5 + v[77]*x6 + v[92]*x7 + v[107]*x8 + v[122]*x9 + v[137]*x10 + v[152]*x11 + v[167]*x12 + v[182]*x13 + v[197]*x14 + v[212]*x15;
985: sum4 += v[3]*x1 + v[18]*x2 + v[33]*x3 + v[48]*x4 + v[63]*x5 + v[78]*x6 + v[93]*x7 + v[108]*x8 + v[123]*x9 + v[138]*x10 + v[153]*x11 + v[168]*x12 + v[183]*x13 + v[198]*x14 + v[213]*x15;
986: sum5 += v[4]*x1 + v[19]*x2 + v[34]*x3 + v[49]*x4 + v[64]*x5 + v[79]*x6 + v[94]*x7 + v[109]*x8 + v[124]*x9 + v[139]*x10 + v[154]*x11 + v[169]*x12 + v[184]*x13 + v[199]*x14 + v[214]*x15;
987: sum6 += v[5]*x1 + v[20]*x2 + v[35]*x3 + v[50]*x4 + v[65]*x5 + v[80]*x6 + v[95]*x7 + v[110]*x8 + v[125]*x9 + v[140]*x10 + v[155]*x11 + v[170]*x12 + v[185]*x13 + v[200]*x14 + v[215]*x15;
988: sum7 += v[6]*x1 + v[21]*x2 + v[36]*x3 + v[51]*x4 + v[66]*x5 + v[81]*x6 + v[96]*x7 + v[111]*x8 + v[126]*x9 + v[141]*x10 + v[156]*x11 + v[171]*x12 + v[186]*x13 + v[201]*x14 + v[216]*x15;
989: sum8 += v[7]*x1 + v[22]*x2 + v[37]*x3 + v[52]*x4 + v[67]*x5 + v[82]*x6 + v[97]*x7 + v[112]*x8 + v[127]*x9 + v[142]*x10 + v[157]*x11 + v[172]*x12 + v[187]*x13 + v[202]*x14 + v[217]*x15;
990: sum9 += v[8]*x1 + v[23]*x2 + v[38]*x3 + v[53]*x4 + v[68]*x5 + v[83]*x6 + v[98]*x7 + v[113]*x8 + v[128]*x9 + v[143]*x10 + v[158]*x11 + v[173]*x12 + v[188]*x13 + v[203]*x14 + v[218]*x15;
991: sum10 += v[9]*x1 + v[24]*x2 + v[39]*x3 + v[54]*x4 + v[69]*x5 + v[84]*x6 + v[99]*x7 + v[114]*x8 + v[129]*x9 + v[144]*x10 + v[159]*x11 + v[174]*x12 + v[189]*x13 + v[204]*x14 + v[219]*x15;
992: sum11 += v[10]*x1 + v[25]*x2 + v[40]*x3 + v[55]*x4 + v[70]*x5 + v[85]*x6 + v[100]*x7 + v[115]*x8 + v[130]*x9 + v[145]*x10 + v[160]*x11 + v[175]*x12 + v[190]*x13 + v[205]*x14 + v[220]*x15;
993: sum12 += v[11]*x1 + v[26]*x2 + v[41]*x3 + v[56]*x4 + v[71]*x5 + v[86]*x6 + v[101]*x7 + v[116]*x8 + v[131]*x9 + v[146]*x10 + v[161]*x11 + v[176]*x12 + v[191]*x13 + v[206]*x14 + v[221]*x15;
994: sum13 += v[12]*x1 + v[27]*x2 + v[42]*x3 + v[57]*x4 + v[72]*x5 + v[87]*x6 + v[102]*x7 + v[117]*x8 + v[132]*x9 + v[147]*x10 + v[162]*x11 + v[177]*x12 + v[192]*x13 + v[207]*x14 + v[222]*x15;
995: sum14 += v[13]*x1 + v[28]*x2 + v[43]*x3 + v[58]*x4 + v[73]*x5 + v[88]*x6 + v[103]*x7 + v[118]*x8 + v[133]*x9 + v[148]*x10 + v[163]*x11 + v[178]*x12 + v[193]*x13 + v[208]*x14 + v[223]*x15;
996: sum15 += v[14]*x1 + v[29]*x2 + v[44]*x3 + v[59]*x4 + v[74]*x5 + v[89]*x6 + v[104]*x7 + v[119]*x8 + v[134]*x9 + v[149]*x10 + v[164]*x11 + v[179]*x12 + v[194]*x13 + v[209]*x14 + v[224]*x15;
997: v += 225;
998: }
999: if (usecprow) z = zarray + 15*ridx[i];
1000: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1001: z[7] = sum8; z[8] = sum9; z[9] = sum10; z[10] = sum11; z[11] = sum12; z[12] = sum13; z[13] = sum14;z[14] = sum15;
1003: if (!usecprow) z += 15;
1004: }
1006: VecRestoreArrayRead(xx,&x);
1007: VecRestoreArray(zz,&zarray);
1008: PetscLogFlops(450.0*a->nz - 15.0*a->nonzerorowcnt);
1009: return(0);
1010: }
1013: /*
1014: This will not work with MatScalar == float because it calls the BLAS
1015: */
1018: PetscErrorCode MatMult_SeqBAIJ_N(Mat A,Vec xx,Vec zz)
1019: {
1020: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1021: PetscScalar *z = 0,*work,*workt,*zarray;
1022: const PetscScalar *x,*xb;
1023: const MatScalar *v;
1024: PetscErrorCode ierr;
1025: PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
1026: const PetscInt *idx,*ii,*ridx=NULL;
1027: PetscInt ncols,k;
1028: PetscBool usecprow=a->compressedrow.use;
1031: VecGetArrayRead(xx,&x);
1032: VecGetArray(zz,&zarray);
1034: idx = a->j;
1035: v = a->a;
1036: if (usecprow) {
1037: mbs = a->compressedrow.nrows;
1038: ii = a->compressedrow.i;
1039: ridx = a->compressedrow.rindex;
1040: PetscMemzero(zarray,bs*a->mbs*sizeof(PetscScalar));
1041: } else {
1042: mbs = a->mbs;
1043: ii = a->i;
1044: z = zarray;
1045: }
1047: if (!a->mult_work) {
1048: k = PetscMax(A->rmap->n,A->cmap->n);
1049: PetscMalloc1(k+1,&a->mult_work);
1050: }
1051: work = a->mult_work;
1052: for (i=0; i<mbs; i++) {
1053: n = ii[1] - ii[0]; ii++;
1054: ncols = n*bs;
1055: workt = work;
1056: for (j=0; j<n; j++) {
1057: xb = x + bs*(*idx++);
1058: for (k=0; k<bs; k++) workt[k] = xb[k];
1059: workt += bs;
1060: }
1061: if (usecprow) z = zarray + bs*ridx[i];
1062: PetscKernel_w_gets_Ar_times_v(bs,ncols,work,v,z);
1063: /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DZero,z,&_One); */
1064: v += n*bs2;
1065: if (!usecprow) z += bs;
1066: }
1067: VecRestoreArrayRead(xx,&x);
1068: VecRestoreArray(zz,&zarray);
1069: PetscLogFlops(2.0*a->nz*bs2 - bs*a->nonzerorowcnt);
1070: return(0);
1071: }
1075: PetscErrorCode MatMultAdd_SeqBAIJ_1(Mat A,Vec xx,Vec yy,Vec zz)
1076: {
1077: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1078: const PetscScalar *x;
1079: PetscScalar *y,*z,sum;
1080: const MatScalar *v;
1081: PetscErrorCode ierr;
1082: PetscInt mbs=a->mbs,i,n,*ridx=NULL;
1083: const PetscInt *idx,*ii;
1084: PetscBool usecprow=a->compressedrow.use;
1087: VecGetArrayRead(xx,&x);
1088: VecGetArrayPair(yy,zz,&y,&z);
1090: idx = a->j;
1091: v = a->a;
1092: if (usecprow) {
1093: if (zz != yy) {
1094: PetscMemcpy(z,y,mbs*sizeof(PetscScalar));
1095: }
1096: mbs = a->compressedrow.nrows;
1097: ii = a->compressedrow.i;
1098: ridx = a->compressedrow.rindex;
1099: } else {
1100: ii = a->i;
1101: }
1103: for (i=0; i<mbs; i++) {
1104: n = ii[1] - ii[0];
1105: ii++;
1106: if (!usecprow) {
1107: sum = y[i];
1108: } else {
1109: sum = y[ridx[i]];
1110: }
1111: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1112: PetscPrefetchBlock(v+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1113: PetscSparseDensePlusDot(sum,x,v,idx,n);
1114: v += n;
1115: idx += n;
1116: if (usecprow) {
1117: z[ridx[i]] = sum;
1118: } else {
1119: z[i] = sum;
1120: }
1121: }
1122: VecRestoreArrayRead(xx,&x);
1123: VecRestoreArrayPair(yy,zz,&y,&z);
1124: PetscLogFlops(2.0*a->nz);
1125: return(0);
1126: }
1130: PetscErrorCode MatMultAdd_SeqBAIJ_2(Mat A,Vec xx,Vec yy,Vec zz)
1131: {
1132: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1133: PetscScalar *y = 0,*z = 0,sum1,sum2;
1134: const PetscScalar *x,*xb;
1135: PetscScalar x1,x2,*yarray,*zarray;
1136: const MatScalar *v;
1137: PetscErrorCode ierr;
1138: PetscInt mbs = a->mbs,i,n,j;
1139: const PetscInt *idx,*ii,*ridx = NULL;
1140: PetscBool usecprow = a->compressedrow.use;
1143: VecGetArrayRead(xx,&x);
1144: VecGetArrayPair(yy,zz,&yarray,&zarray);
1146: idx = a->j;
1147: v = a->a;
1148: if (usecprow) {
1149: if (zz != yy) {
1150: PetscMemcpy(zarray,yarray,2*mbs*sizeof(PetscScalar));
1151: }
1152: mbs = a->compressedrow.nrows;
1153: ii = a->compressedrow.i;
1154: ridx = a->compressedrow.rindex;
1155: } else {
1156: ii = a->i;
1157: y = yarray;
1158: z = zarray;
1159: }
1161: for (i=0; i<mbs; i++) {
1162: n = ii[1] - ii[0]; ii++;
1163: if (usecprow) {
1164: z = zarray + 2*ridx[i];
1165: y = yarray + 2*ridx[i];
1166: }
1167: sum1 = y[0]; sum2 = y[1];
1168: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1169: PetscPrefetchBlock(v+4*n,4*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1170: for (j=0; j<n; j++) {
1171: xb = x + 2*(*idx++);
1172: x1 = xb[0];
1173: x2 = xb[1];
1175: sum1 += v[0]*x1 + v[2]*x2;
1176: sum2 += v[1]*x1 + v[3]*x2;
1177: v += 4;
1178: }
1179: z[0] = sum1; z[1] = sum2;
1180: if (!usecprow) {
1181: z += 2; y += 2;
1182: }
1183: }
1184: VecRestoreArrayRead(xx,&x);
1185: VecRestoreArrayPair(yy,zz,&yarray,&zarray);
1186: PetscLogFlops(4.0*a->nz);
1187: return(0);
1188: }
1192: PetscErrorCode MatMultAdd_SeqBAIJ_3(Mat A,Vec xx,Vec yy,Vec zz)
1193: {
1194: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1195: PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,x1,x2,x3,*yarray,*zarray;
1196: const PetscScalar *x,*xb;
1197: const MatScalar *v;
1198: PetscErrorCode ierr;
1199: PetscInt mbs = a->mbs,i,j,n;
1200: const PetscInt *idx,*ii,*ridx = NULL;
1201: PetscBool usecprow = a->compressedrow.use;
1204: VecGetArrayRead(xx,&x);
1205: VecGetArrayPair(yy,zz,&yarray,&zarray);
1207: idx = a->j;
1208: v = a->a;
1209: if (usecprow) {
1210: if (zz != yy) {
1211: PetscMemcpy(zarray,yarray,3*mbs*sizeof(PetscScalar));
1212: }
1213: mbs = a->compressedrow.nrows;
1214: ii = a->compressedrow.i;
1215: ridx = a->compressedrow.rindex;
1216: } else {
1217: ii = a->i;
1218: y = yarray;
1219: z = zarray;
1220: }
1222: for (i=0; i<mbs; i++) {
1223: n = ii[1] - ii[0]; ii++;
1224: if (usecprow) {
1225: z = zarray + 3*ridx[i];
1226: y = yarray + 3*ridx[i];
1227: }
1228: sum1 = y[0]; sum2 = y[1]; sum3 = y[2];
1229: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1230: PetscPrefetchBlock(v+9*n,9*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1231: for (j=0; j<n; j++) {
1232: xb = x + 3*(*idx++); x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1233: sum1 += v[0]*x1 + v[3]*x2 + v[6]*x3;
1234: sum2 += v[1]*x1 + v[4]*x2 + v[7]*x3;
1235: sum3 += v[2]*x1 + v[5]*x2 + v[8]*x3;
1236: v += 9;
1237: }
1238: z[0] = sum1; z[1] = sum2; z[2] = sum3;
1239: if (!usecprow) {
1240: z += 3; y += 3;
1241: }
1242: }
1243: VecRestoreArrayRead(xx,&x);
1244: VecRestoreArrayPair(yy,zz,&yarray,&zarray);
1245: PetscLogFlops(18.0*a->nz);
1246: return(0);
1247: }
1251: PetscErrorCode MatMultAdd_SeqBAIJ_4(Mat A,Vec xx,Vec yy,Vec zz)
1252: {
1253: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1254: PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,sum4,x1,x2,x3,x4,*yarray,*zarray;
1255: const PetscScalar *x,*xb;
1256: const MatScalar *v;
1257: PetscErrorCode ierr;
1258: PetscInt mbs = a->mbs,i,j,n;
1259: const PetscInt *idx,*ii,*ridx=NULL;
1260: PetscBool usecprow=a->compressedrow.use;
1263: VecGetArrayRead(xx,&x);
1264: VecGetArrayPair(yy,zz,&yarray,&zarray);
1266: idx = a->j;
1267: v = a->a;
1268: if (usecprow) {
1269: if (zz != yy) {
1270: PetscMemcpy(zarray,yarray,4*mbs*sizeof(PetscScalar));
1271: }
1272: mbs = a->compressedrow.nrows;
1273: ii = a->compressedrow.i;
1274: ridx = a->compressedrow.rindex;
1275: } else {
1276: ii = a->i;
1277: y = yarray;
1278: z = zarray;
1279: }
1281: for (i=0; i<mbs; i++) {
1282: n = ii[1] - ii[0]; ii++;
1283: if (usecprow) {
1284: z = zarray + 4*ridx[i];
1285: y = yarray + 4*ridx[i];
1286: }
1287: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3];
1288: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1289: PetscPrefetchBlock(v+16*n,16*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1290: for (j=0; j<n; j++) {
1291: xb = x + 4*(*idx++);
1292: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1293: sum1 += v[0]*x1 + v[4]*x2 + v[8]*x3 + v[12]*x4;
1294: sum2 += v[1]*x1 + v[5]*x2 + v[9]*x3 + v[13]*x4;
1295: sum3 += v[2]*x1 + v[6]*x2 + v[10]*x3 + v[14]*x4;
1296: sum4 += v[3]*x1 + v[7]*x2 + v[11]*x3 + v[15]*x4;
1297: v += 16;
1298: }
1299: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4;
1300: if (!usecprow) {
1301: z += 4; y += 4;
1302: }
1303: }
1304: VecRestoreArrayRead(xx,&x);
1305: VecRestoreArrayPair(yy,zz,&yarray,&zarray);
1306: PetscLogFlops(32.0*a->nz);
1307: return(0);
1308: }
1312: PetscErrorCode MatMultAdd_SeqBAIJ_5(Mat A,Vec xx,Vec yy,Vec zz)
1313: {
1314: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1315: PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,x1,x2,x3,x4,x5;
1316: const PetscScalar *x,*xb;
1317: PetscScalar *yarray,*zarray;
1318: const MatScalar *v;
1319: PetscErrorCode ierr;
1320: PetscInt mbs = a->mbs,i,j,n;
1321: const PetscInt *idx,*ii,*ridx = NULL;
1322: PetscBool usecprow=a->compressedrow.use;
1325: VecGetArrayRead(xx,&x);
1326: VecGetArrayPair(yy,zz,&yarray,&zarray);
1328: idx = a->j;
1329: v = a->a;
1330: if (usecprow) {
1331: if (zz != yy) {
1332: PetscMemcpy(zarray,yarray,5*mbs*sizeof(PetscScalar));
1333: }
1334: mbs = a->compressedrow.nrows;
1335: ii = a->compressedrow.i;
1336: ridx = a->compressedrow.rindex;
1337: } else {
1338: ii = a->i;
1339: y = yarray;
1340: z = zarray;
1341: }
1343: for (i=0; i<mbs; i++) {
1344: n = ii[1] - ii[0]; ii++;
1345: if (usecprow) {
1346: z = zarray + 5*ridx[i];
1347: y = yarray + 5*ridx[i];
1348: }
1349: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4];
1350: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1351: PetscPrefetchBlock(v+25*n,25*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1352: for (j=0; j<n; j++) {
1353: xb = x + 5*(*idx++);
1354: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4];
1355: sum1 += v[0]*x1 + v[5]*x2 + v[10]*x3 + v[15]*x4 + v[20]*x5;
1356: sum2 += v[1]*x1 + v[6]*x2 + v[11]*x3 + v[16]*x4 + v[21]*x5;
1357: sum3 += v[2]*x1 + v[7]*x2 + v[12]*x3 + v[17]*x4 + v[22]*x5;
1358: sum4 += v[3]*x1 + v[8]*x2 + v[13]*x3 + v[18]*x4 + v[23]*x5;
1359: sum5 += v[4]*x1 + v[9]*x2 + v[14]*x3 + v[19]*x4 + v[24]*x5;
1360: v += 25;
1361: }
1362: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5;
1363: if (!usecprow) {
1364: z += 5; y += 5;
1365: }
1366: }
1367: VecRestoreArrayRead(xx,&x);
1368: VecRestoreArrayPair(yy,zz,&yarray,&zarray);
1369: PetscLogFlops(50.0*a->nz);
1370: return(0);
1371: }
1374: PetscErrorCode MatMultAdd_SeqBAIJ_6(Mat A,Vec xx,Vec yy,Vec zz)
1375: {
1376: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1377: PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,sum6;
1378: const PetscScalar *x,*xb;
1379: PetscScalar x1,x2,x3,x4,x5,x6,*yarray,*zarray;
1380: const MatScalar *v;
1381: PetscErrorCode ierr;
1382: PetscInt mbs = a->mbs,i,j,n;
1383: const PetscInt *idx,*ii,*ridx=NULL;
1384: PetscBool usecprow=a->compressedrow.use;
1387: VecGetArrayRead(xx,&x);
1388: VecGetArrayPair(yy,zz,&yarray,&zarray);
1390: idx = a->j;
1391: v = a->a;
1392: if (usecprow) {
1393: if (zz != yy) {
1394: PetscMemcpy(zarray,yarray,6*mbs*sizeof(PetscScalar));
1395: }
1396: mbs = a->compressedrow.nrows;
1397: ii = a->compressedrow.i;
1398: ridx = a->compressedrow.rindex;
1399: } else {
1400: ii = a->i;
1401: y = yarray;
1402: z = zarray;
1403: }
1405: for (i=0; i<mbs; i++) {
1406: n = ii[1] - ii[0]; ii++;
1407: if (usecprow) {
1408: z = zarray + 6*ridx[i];
1409: y = yarray + 6*ridx[i];
1410: }
1411: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5];
1412: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1413: PetscPrefetchBlock(v+36*n,36*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1414: for (j=0; j<n; j++) {
1415: xb = x + 6*(*idx++);
1416: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5];
1417: sum1 += v[0]*x1 + v[6]*x2 + v[12]*x3 + v[18]*x4 + v[24]*x5 + v[30]*x6;
1418: sum2 += v[1]*x1 + v[7]*x2 + v[13]*x3 + v[19]*x4 + v[25]*x5 + v[31]*x6;
1419: sum3 += v[2]*x1 + v[8]*x2 + v[14]*x3 + v[20]*x4 + v[26]*x5 + v[32]*x6;
1420: sum4 += v[3]*x1 + v[9]*x2 + v[15]*x3 + v[21]*x4 + v[27]*x5 + v[33]*x6;
1421: sum5 += v[4]*x1 + v[10]*x2 + v[16]*x3 + v[22]*x4 + v[28]*x5 + v[34]*x6;
1422: sum6 += v[5]*x1 + v[11]*x2 + v[17]*x3 + v[23]*x4 + v[29]*x5 + v[35]*x6;
1423: v += 36;
1424: }
1425: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6;
1426: if (!usecprow) {
1427: z += 6; y += 6;
1428: }
1429: }
1430: VecRestoreArrayRead(xx,&x);
1431: VecRestoreArrayPair(yy,zz,&yarray,&zarray);
1432: PetscLogFlops(72.0*a->nz);
1433: return(0);
1434: }
1438: PetscErrorCode MatMultAdd_SeqBAIJ_7(Mat A,Vec xx,Vec yy,Vec zz)
1439: {
1440: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1441: PetscScalar *y = 0,*z = 0,sum1,sum2,sum3,sum4,sum5,sum6,sum7;
1442: const PetscScalar *x,*xb;
1443: PetscScalar x1,x2,x3,x4,x5,x6,x7,*yarray,*zarray;
1444: const MatScalar *v;
1445: PetscErrorCode ierr;
1446: PetscInt mbs = a->mbs,i,j,n;
1447: const PetscInt *idx,*ii,*ridx = NULL;
1448: PetscBool usecprow=a->compressedrow.use;
1451: VecGetArrayRead(xx,&x);
1452: VecGetArrayPair(yy,zz,&yarray,&zarray);
1454: idx = a->j;
1455: v = a->a;
1456: if (usecprow) {
1457: if (zz != yy) {
1458: PetscMemcpy(zarray,yarray,7*mbs*sizeof(PetscScalar));
1459: }
1460: mbs = a->compressedrow.nrows;
1461: ii = a->compressedrow.i;
1462: ridx = a->compressedrow.rindex;
1463: } else {
1464: ii = a->i;
1465: y = yarray;
1466: z = zarray;
1467: }
1469: for (i=0; i<mbs; i++) {
1470: n = ii[1] - ii[0]; ii++;
1471: if (usecprow) {
1472: z = zarray + 7*ridx[i];
1473: y = yarray + 7*ridx[i];
1474: }
1475: sum1 = y[0]; sum2 = y[1]; sum3 = y[2]; sum4 = y[3]; sum5 = y[4]; sum6 = y[5]; sum7 = y[6];
1476: PetscPrefetchBlock(idx+n,n,0,PETSC_PREFETCH_HINT_NTA); /* Indices for the next row (assumes same size as this one) */
1477: PetscPrefetchBlock(v+49*n,49*n,0,PETSC_PREFETCH_HINT_NTA); /* Entries for the next row */
1478: for (j=0; j<n; j++) {
1479: xb = x + 7*(*idx++);
1480: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3]; x5 = xb[4]; x6 = xb[5]; x7 = xb[6];
1481: sum1 += v[0]*x1 + v[7]*x2 + v[14]*x3 + v[21]*x4 + v[28]*x5 + v[35]*x6 + v[42]*x7;
1482: sum2 += v[1]*x1 + v[8]*x2 + v[15]*x3 + v[22]*x4 + v[29]*x5 + v[36]*x6 + v[43]*x7;
1483: sum3 += v[2]*x1 + v[9]*x2 + v[16]*x3 + v[23]*x4 + v[30]*x5 + v[37]*x6 + v[44]*x7;
1484: sum4 += v[3]*x1 + v[10]*x2 + v[17]*x3 + v[24]*x4 + v[31]*x5 + v[38]*x6 + v[45]*x7;
1485: sum5 += v[4]*x1 + v[11]*x2 + v[18]*x3 + v[25]*x4 + v[32]*x5 + v[39]*x6 + v[46]*x7;
1486: sum6 += v[5]*x1 + v[12]*x2 + v[19]*x3 + v[26]*x4 + v[33]*x5 + v[40]*x6 + v[47]*x7;
1487: sum7 += v[6]*x1 + v[13]*x2 + v[20]*x3 + v[27]*x4 + v[34]*x5 + v[41]*x6 + v[48]*x7;
1488: v += 49;
1489: }
1490: z[0] = sum1; z[1] = sum2; z[2] = sum3; z[3] = sum4; z[4] = sum5; z[5] = sum6; z[6] = sum7;
1491: if (!usecprow) {
1492: z += 7; y += 7;
1493: }
1494: }
1495: VecRestoreArrayRead(xx,&x);
1496: VecRestoreArrayPair(yy,zz,&yarray,&zarray);
1497: PetscLogFlops(98.0*a->nz);
1498: return(0);
1499: }
1503: PetscErrorCode MatMultAdd_SeqBAIJ_N(Mat A,Vec xx,Vec yy,Vec zz)
1504: {
1505: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1506: PetscScalar *z = 0,*work,*workt,*zarray;
1507: const PetscScalar *x,*xb;
1508: const MatScalar *v;
1509: PetscErrorCode ierr;
1510: PetscInt mbs,i,bs=A->rmap->bs,j,n,bs2=a->bs2;
1511: PetscInt ncols,k;
1512: const PetscInt *ridx = NULL,*idx,*ii;
1513: PetscBool usecprow = a->compressedrow.use;
1516: VecCopy(yy,zz);
1517: VecGetArrayRead(xx,&x);
1518: VecGetArray(zz,&zarray);
1520: idx = a->j;
1521: v = a->a;
1522: if (usecprow) {
1523: mbs = a->compressedrow.nrows;
1524: ii = a->compressedrow.i;
1525: ridx = a->compressedrow.rindex;
1526: } else {
1527: mbs = a->mbs;
1528: ii = a->i;
1529: z = zarray;
1530: }
1532: if (!a->mult_work) {
1533: k = PetscMax(A->rmap->n,A->cmap->n);
1534: PetscMalloc1(k+1,&a->mult_work);
1535: }
1536: work = a->mult_work;
1537: for (i=0; i<mbs; i++) {
1538: n = ii[1] - ii[0]; ii++;
1539: ncols = n*bs;
1540: workt = work;
1541: for (j=0; j<n; j++) {
1542: xb = x + bs*(*idx++);
1543: for (k=0; k<bs; k++) workt[k] = xb[k];
1544: workt += bs;
1545: }
1546: if (usecprow) z = zarray + bs*ridx[i];
1547: PetscKernel_w_gets_w_plus_Ar_times_v(bs,ncols,work,v,z);
1548: /* BLASgemv_("N",&bs,&ncols,&_DOne,v,&bs,work,&_One,&_DOne,z,&_One); */
1549: v += n*bs2;
1550: if (!usecprow) z += bs;
1551: }
1552: VecRestoreArrayRead(xx,&x);
1553: VecRestoreArray(zz,&zarray);
1554: PetscLogFlops(2.0*a->nz*bs2);
1555: return(0);
1556: }
1560: PetscErrorCode MatMultHermitianTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
1561: {
1562: PetscScalar zero = 0.0;
1566: VecSet(zz,zero);
1567: MatMultHermitianTransposeAdd_SeqBAIJ(A,xx,zz,zz);
1568: return(0);
1569: }
1573: PetscErrorCode MatMultTranspose_SeqBAIJ(Mat A,Vec xx,Vec zz)
1574: {
1575: PetscScalar zero = 0.0;
1579: VecSet(zz,zero);
1580: MatMultTransposeAdd_SeqBAIJ(A,xx,zz,zz);
1581: return(0);
1582: }
1586: PetscErrorCode MatMultHermitianTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1587: {
1588: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1589: PetscScalar *z,x1,x2,x3,x4,x5;
1590: const PetscScalar *x,*xb = NULL;
1591: const MatScalar *v;
1592: PetscErrorCode ierr;
1593: PetscInt mbs,i,rval,bs=A->rmap->bs,j,n;
1594: const PetscInt *idx,*ii,*ib,*ridx = NULL;
1595: Mat_CompressedRow cprow = a->compressedrow;
1596: PetscBool usecprow = cprow.use;
1599: if (yy != zz) { VecCopy(yy,zz); }
1600: VecGetArrayRead(xx,&x);
1601: VecGetArray(zz,&z);
1603: idx = a->j;
1604: v = a->a;
1605: if (usecprow) {
1606: mbs = cprow.nrows;
1607: ii = cprow.i;
1608: ridx = cprow.rindex;
1609: } else {
1610: mbs=a->mbs;
1611: ii = a->i;
1612: xb = x;
1613: }
1615: switch (bs) {
1616: case 1:
1617: for (i=0; i<mbs; i++) {
1618: if (usecprow) xb = x + ridx[i];
1619: x1 = xb[0];
1620: ib = idx + ii[0];
1621: n = ii[1] - ii[0]; ii++;
1622: for (j=0; j<n; j++) {
1623: rval = ib[j];
1624: z[rval] += PetscConj(*v) * x1;
1625: v++;
1626: }
1627: if (!usecprow) xb++;
1628: }
1629: break;
1630: case 2:
1631: for (i=0; i<mbs; i++) {
1632: if (usecprow) xb = x + 2*ridx[i];
1633: x1 = xb[0]; x2 = xb[1];
1634: ib = idx + ii[0];
1635: n = ii[1] - ii[0]; ii++;
1636: for (j=0; j<n; j++) {
1637: rval = ib[j]*2;
1638: z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2;
1639: z[rval++] += PetscConj(v[2])*x1 + PetscConj(v[3])*x2;
1640: v += 4;
1641: }
1642: if (!usecprow) xb += 2;
1643: }
1644: break;
1645: case 3:
1646: for (i=0; i<mbs; i++) {
1647: if (usecprow) xb = x + 3*ridx[i];
1648: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1649: ib = idx + ii[0];
1650: n = ii[1] - ii[0]; ii++;
1651: for (j=0; j<n; j++) {
1652: rval = ib[j]*3;
1653: z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3;
1654: z[rval++] += PetscConj(v[3])*x1 + PetscConj(v[4])*x2 + PetscConj(v[5])*x3;
1655: z[rval++] += PetscConj(v[6])*x1 + PetscConj(v[7])*x2 + PetscConj(v[8])*x3;
1656: v += 9;
1657: }
1658: if (!usecprow) xb += 3;
1659: }
1660: break;
1661: case 4:
1662: for (i=0; i<mbs; i++) {
1663: if (usecprow) xb = x + 4*ridx[i];
1664: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1665: ib = idx + ii[0];
1666: n = ii[1] - ii[0]; ii++;
1667: for (j=0; j<n; j++) {
1668: rval = ib[j]*4;
1669: z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3 + PetscConj(v[3])*x4;
1670: z[rval++] += PetscConj(v[4])*x1 + PetscConj(v[5])*x2 + PetscConj(v[6])*x3 + PetscConj(v[7])*x4;
1671: z[rval++] += PetscConj(v[8])*x1 + PetscConj(v[9])*x2 + PetscConj(v[10])*x3 + PetscConj(v[11])*x4;
1672: z[rval++] += PetscConj(v[12])*x1 + PetscConj(v[13])*x2 + PetscConj(v[14])*x3 + PetscConj(v[15])*x4;
1673: v += 16;
1674: }
1675: if (!usecprow) xb += 4;
1676: }
1677: break;
1678: case 5:
1679: for (i=0; i<mbs; i++) {
1680: if (usecprow) xb = x + 5*ridx[i];
1681: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1682: x4 = xb[3]; x5 = xb[4];
1683: ib = idx + ii[0];
1684: n = ii[1] - ii[0]; ii++;
1685: for (j=0; j<n; j++) {
1686: rval = ib[j]*5;
1687: z[rval++] += PetscConj(v[0])*x1 + PetscConj(v[1])*x2 + PetscConj(v[2])*x3 + PetscConj(v[3])*x4 + PetscConj(v[4])*x5;
1688: z[rval++] += PetscConj(v[5])*x1 + PetscConj(v[6])*x2 + PetscConj(v[7])*x3 + PetscConj(v[8])*x4 + PetscConj(v[9])*x5;
1689: z[rval++] += PetscConj(v[10])*x1 + PetscConj(v[11])*x2 + PetscConj(v[12])*x3 + PetscConj(v[13])*x4 + PetscConj(v[14])*x5;
1690: z[rval++] += PetscConj(v[15])*x1 + PetscConj(v[16])*x2 + PetscConj(v[17])*x3 + PetscConj(v[18])*x4 + PetscConj(v[19])*x5;
1691: z[rval++] += PetscConj(v[20])*x1 + PetscConj(v[21])*x2 + PetscConj(v[22])*x3 + PetscConj(v[23])*x4 + PetscConj(v[24])*x5;
1692: v += 25;
1693: }
1694: if (!usecprow) xb += 5;
1695: }
1696: break;
1697: default: /* block sizes larger than 5 by 5 are handled by BLAS */
1698: SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"block size larger than 5 is not supported yet");
1699: #if 0
1700: {
1701: PetscInt ncols,k,bs2=a->bs2;
1702: PetscScalar *work,*workt,zb;
1703: const PetscScalar *xtmp;
1704: if (!a->mult_work) {
1705: k = PetscMax(A->rmap->n,A->cmap->n);
1706: PetscMalloc1(k+1,&a->mult_work);
1707: }
1708: work = a->mult_work;
1709: xtmp = x;
1710: for (i=0; i<mbs; i++) {
1711: n = ii[1] - ii[0]; ii++;
1712: ncols = n*bs;
1713: PetscMemzero(work,ncols*sizeof(PetscScalar));
1714: if (usecprow) xtmp = x + bs*ridx[i];
1715: PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
1716: /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */
1717: v += n*bs2;
1718: if (!usecprow) xtmp += bs;
1719: workt = work;
1720: for (j=0; j<n; j++) {
1721: zb = z + bs*(*idx++);
1722: for (k=0; k<bs; k++) zb[k] += workt[k] ;
1723: workt += bs;
1724: }
1725: }
1726: }
1727: #endif
1728: }
1729: VecRestoreArrayRead(xx,&x);
1730: VecRestoreArray(zz,&z);
1731: PetscLogFlops(2.0*a->nz*a->bs2);
1732: return(0);
1733: }
1737: PetscErrorCode MatMultTransposeAdd_SeqBAIJ(Mat A,Vec xx,Vec yy,Vec zz)
1738: {
1739: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1740: PetscScalar *zb,*z,x1,x2,x3,x4,x5;
1741: const PetscScalar *x,*xb = 0;
1742: const MatScalar *v;
1743: PetscErrorCode ierr;
1744: PetscInt mbs,i,rval,bs=A->rmap->bs,j,n,bs2=a->bs2;
1745: const PetscInt *idx,*ii,*ib,*ridx = NULL;
1746: Mat_CompressedRow cprow = a->compressedrow;
1747: PetscBool usecprow=cprow.use;
1750: if (yy != zz) { VecCopy(yy,zz); }
1751: VecGetArrayRead(xx,&x);
1752: VecGetArray(zz,&z);
1754: idx = a->j;
1755: v = a->a;
1756: if (usecprow) {
1757: mbs = cprow.nrows;
1758: ii = cprow.i;
1759: ridx = cprow.rindex;
1760: } else {
1761: mbs=a->mbs;
1762: ii = a->i;
1763: xb = x;
1764: }
1766: switch (bs) {
1767: case 1:
1768: for (i=0; i<mbs; i++) {
1769: if (usecprow) xb = x + ridx[i];
1770: x1 = xb[0];
1771: ib = idx + ii[0];
1772: n = ii[1] - ii[0]; ii++;
1773: for (j=0; j<n; j++) {
1774: rval = ib[j];
1775: z[rval] += *v * x1;
1776: v++;
1777: }
1778: if (!usecprow) xb++;
1779: }
1780: break;
1781: case 2:
1782: for (i=0; i<mbs; i++) {
1783: if (usecprow) xb = x + 2*ridx[i];
1784: x1 = xb[0]; x2 = xb[1];
1785: ib = idx + ii[0];
1786: n = ii[1] - ii[0]; ii++;
1787: for (j=0; j<n; j++) {
1788: rval = ib[j]*2;
1789: z[rval++] += v[0]*x1 + v[1]*x2;
1790: z[rval++] += v[2]*x1 + v[3]*x2;
1791: v += 4;
1792: }
1793: if (!usecprow) xb += 2;
1794: }
1795: break;
1796: case 3:
1797: for (i=0; i<mbs; i++) {
1798: if (usecprow) xb = x + 3*ridx[i];
1799: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1800: ib = idx + ii[0];
1801: n = ii[1] - ii[0]; ii++;
1802: for (j=0; j<n; j++) {
1803: rval = ib[j]*3;
1804: z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3;
1805: z[rval++] += v[3]*x1 + v[4]*x2 + v[5]*x3;
1806: z[rval++] += v[6]*x1 + v[7]*x2 + v[8]*x3;
1807: v += 9;
1808: }
1809: if (!usecprow) xb += 3;
1810: }
1811: break;
1812: case 4:
1813: for (i=0; i<mbs; i++) {
1814: if (usecprow) xb = x + 4*ridx[i];
1815: x1 = xb[0]; x2 = xb[1]; x3 = xb[2]; x4 = xb[3];
1816: ib = idx + ii[0];
1817: n = ii[1] - ii[0]; ii++;
1818: for (j=0; j<n; j++) {
1819: rval = ib[j]*4;
1820: z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4;
1821: z[rval++] += v[4]*x1 + v[5]*x2 + v[6]*x3 + v[7]*x4;
1822: z[rval++] += v[8]*x1 + v[9]*x2 + v[10]*x3 + v[11]*x4;
1823: z[rval++] += v[12]*x1 + v[13]*x2 + v[14]*x3 + v[15]*x4;
1824: v += 16;
1825: }
1826: if (!usecprow) xb += 4;
1827: }
1828: break;
1829: case 5:
1830: for (i=0; i<mbs; i++) {
1831: if (usecprow) xb = x + 5*ridx[i];
1832: x1 = xb[0]; x2 = xb[1]; x3 = xb[2];
1833: x4 = xb[3]; x5 = xb[4];
1834: ib = idx + ii[0];
1835: n = ii[1] - ii[0]; ii++;
1836: for (j=0; j<n; j++) {
1837: rval = ib[j]*5;
1838: z[rval++] += v[0]*x1 + v[1]*x2 + v[2]*x3 + v[3]*x4 + v[4]*x5;
1839: z[rval++] += v[5]*x1 + v[6]*x2 + v[7]*x3 + v[8]*x4 + v[9]*x5;
1840: z[rval++] += v[10]*x1 + v[11]*x2 + v[12]*x3 + v[13]*x4 + v[14]*x5;
1841: z[rval++] += v[15]*x1 + v[16]*x2 + v[17]*x3 + v[18]*x4 + v[19]*x5;
1842: z[rval++] += v[20]*x1 + v[21]*x2 + v[22]*x3 + v[23]*x4 + v[24]*x5;
1843: v += 25;
1844: }
1845: if (!usecprow) xb += 5;
1846: }
1847: break;
1848: default: { /* block sizes larger then 5 by 5 are handled by BLAS */
1849: PetscInt ncols,k;
1850: PetscScalar *work,*workt;
1851: const PetscScalar *xtmp;
1852: if (!a->mult_work) {
1853: k = PetscMax(A->rmap->n,A->cmap->n);
1854: PetscMalloc1(k+1,&a->mult_work);
1855: }
1856: work = a->mult_work;
1857: xtmp = x;
1858: for (i=0; i<mbs; i++) {
1859: n = ii[1] - ii[0]; ii++;
1860: ncols = n*bs;
1861: PetscMemzero(work,ncols*sizeof(PetscScalar));
1862: if (usecprow) xtmp = x + bs*ridx[i];
1863: PetscKernel_w_gets_w_plus_trans_Ar_times_v(bs,ncols,xtmp,v,work);
1864: /* BLASgemv_("T",&bs,&ncols,&_DOne,v,&bs,xtmp,&_One,&_DOne,work,&_One); */
1865: v += n*bs2;
1866: if (!usecprow) xtmp += bs;
1867: workt = work;
1868: for (j=0; j<n; j++) {
1869: zb = z + bs*(*idx++);
1870: for (k=0; k<bs; k++) zb[k] += workt[k];
1871: workt += bs;
1872: }
1873: }
1874: }
1875: }
1876: VecRestoreArrayRead(xx,&x);
1877: VecRestoreArray(zz,&z);
1878: PetscLogFlops(2.0*a->nz*a->bs2);
1879: return(0);
1880: }
1884: PetscErrorCode MatScale_SeqBAIJ(Mat inA,PetscScalar alpha)
1885: {
1886: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)inA->data;
1887: PetscInt totalnz = a->bs2*a->nz;
1888: PetscScalar oalpha = alpha;
1890: PetscBLASInt one = 1,tnz;
1893: PetscBLASIntCast(totalnz,&tnz);
1894: PetscStackCallBLAS("BLASscal",BLASscal_(&tnz,&oalpha,a->a,&one));
1895: PetscLogFlops(totalnz);
1896: return(0);
1897: }
1901: PetscErrorCode MatNorm_SeqBAIJ(Mat A,NormType type,PetscReal *norm)
1902: {
1904: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1905: MatScalar *v = a->a;
1906: PetscReal sum = 0.0;
1907: PetscInt i,j,k,bs=A->rmap->bs,nz=a->nz,bs2=a->bs2,k1;
1910: if (type == NORM_FROBENIUS) {
1911: for (i=0; i< bs2*nz; i++) {
1912: sum += PetscRealPart(PetscConj(*v)*(*v)); v++;
1913: }
1914: *norm = PetscSqrtReal(sum);
1915: PetscLogFlops(2*bs2*nz);
1916: } else if (type == NORM_1) { /* maximum column sum */
1917: PetscReal *tmp;
1918: PetscInt *bcol = a->j;
1919: PetscCalloc1(A->cmap->n+1,&tmp);
1920: for (i=0; i<nz; i++) {
1921: for (j=0; j<bs; j++) {
1922: k1 = bs*(*bcol) + j; /* column index */
1923: for (k=0; k<bs; k++) {
1924: tmp[k1] += PetscAbsScalar(*v); v++;
1925: }
1926: }
1927: bcol++;
1928: }
1929: *norm = 0.0;
1930: for (j=0; j<A->cmap->n; j++) {
1931: if (tmp[j] > *norm) *norm = tmp[j];
1932: }
1933: PetscFree(tmp);
1934: PetscLogFlops(PetscMax(bs2*nz-1,0));
1935: } else if (type == NORM_INFINITY) { /* maximum row sum */
1936: *norm = 0.0;
1937: for (k=0; k<bs; k++) {
1938: for (j=0; j<a->mbs; j++) {
1939: v = a->a + bs2*a->i[j] + k;
1940: sum = 0.0;
1941: for (i=0; i<a->i[j+1]-a->i[j]; i++) {
1942: for (k1=0; k1<bs; k1++) {
1943: sum += PetscAbsScalar(*v);
1944: v += bs;
1945: }
1946: }
1947: if (sum > *norm) *norm = sum;
1948: }
1949: }
1950: PetscLogFlops(PetscMax(bs2*nz-1,0));
1951: } else SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"No support for this norm yet");
1952: return(0);
1953: }
1958: PetscErrorCode MatEqual_SeqBAIJ(Mat A,Mat B,PetscBool * flg)
1959: {
1960: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data,*b = (Mat_SeqBAIJ*)B->data;
1964: /* If the matrix/block dimensions are not equal, or no of nonzeros or shift */
1965: if ((A->rmap->N != B->rmap->N) || (A->cmap->n != B->cmap->n) || (A->rmap->bs != B->rmap->bs)|| (a->nz != b->nz)) {
1966: *flg = PETSC_FALSE;
1967: return(0);
1968: }
1970: /* if the a->i are the same */
1971: PetscMemcmp(a->i,b->i,(a->mbs+1)*sizeof(PetscInt),flg);
1972: if (!*flg) return(0);
1974: /* if a->j are the same */
1975: PetscMemcmp(a->j,b->j,(a->nz)*sizeof(PetscInt),flg);
1976: if (!*flg) return(0);
1978: /* if a->a are the same */
1979: PetscMemcmp(a->a,b->a,(a->nz)*(A->rmap->bs)*(B->rmap->bs)*sizeof(PetscScalar),flg);
1980: return(0);
1982: }
1986: PetscErrorCode MatGetDiagonal_SeqBAIJ(Mat A,Vec v)
1987: {
1988: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
1990: PetscInt i,j,k,n,row,bs,*ai,*aj,ambs,bs2;
1991: PetscScalar *x,zero = 0.0;
1992: MatScalar *aa,*aa_j;
1995: if (A->factortype) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Not for factored matrix");
1996: bs = A->rmap->bs;
1997: aa = a->a;
1998: ai = a->i;
1999: aj = a->j;
2000: ambs = a->mbs;
2001: bs2 = a->bs2;
2003: VecSet(v,zero);
2004: VecGetArray(v,&x);
2005: VecGetLocalSize(v,&n);
2006: if (n != A->rmap->N) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Nonconforming matrix and vector");
2007: for (i=0; i<ambs; i++) {
2008: for (j=ai[i]; j<ai[i+1]; j++) {
2009: if (aj[j] == i) {
2010: row = i*bs;
2011: aa_j = aa+j*bs2;
2012: for (k=0; k<bs2; k+=(bs+1),row++) x[row] = aa_j[k];
2013: break;
2014: }
2015: }
2016: }
2017: VecRestoreArray(v,&x);
2018: return(0);
2019: }
2023: PetscErrorCode MatDiagonalScale_SeqBAIJ(Mat A,Vec ll,Vec rr)
2024: {
2025: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2026: const PetscScalar *l,*r,*li,*ri;
2027: PetscScalar x;
2028: MatScalar *aa, *v;
2029: PetscErrorCode ierr;
2030: PetscInt i,j,k,lm,rn,M,m,n,mbs,tmp,bs,bs2,iai;
2031: const PetscInt *ai,*aj;
2034: ai = a->i;
2035: aj = a->j;
2036: aa = a->a;
2037: m = A->rmap->n;
2038: n = A->cmap->n;
2039: bs = A->rmap->bs;
2040: mbs = a->mbs;
2041: bs2 = a->bs2;
2042: if (ll) {
2043: VecGetArrayRead(ll,&l);
2044: VecGetLocalSize(ll,&lm);
2045: if (lm != m) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Left scaling vector wrong length");
2046: for (i=0; i<mbs; i++) { /* for each block row */
2047: M = ai[i+1] - ai[i];
2048: li = l + i*bs;
2049: v = aa + bs2*ai[i];
2050: for (j=0; j<M; j++) { /* for each block */
2051: for (k=0; k<bs2; k++) {
2052: (*v++) *= li[k%bs];
2053: }
2054: }
2055: }
2056: VecRestoreArrayRead(ll,&l);
2057: PetscLogFlops(a->nz);
2058: }
2060: if (rr) {
2061: VecGetArrayRead(rr,&r);
2062: VecGetLocalSize(rr,&rn);
2063: if (rn != n) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_SIZ,"Right scaling vector wrong length");
2064: for (i=0; i<mbs; i++) { /* for each block row */
2065: iai = ai[i];
2066: M = ai[i+1] - iai;
2067: v = aa + bs2*iai;
2068: for (j=0; j<M; j++) { /* for each block */
2069: ri = r + bs*aj[iai+j];
2070: for (k=0; k<bs; k++) {
2071: x = ri[k];
2072: for (tmp=0; tmp<bs; tmp++) v[tmp] *= x;
2073: v += bs;
2074: }
2075: }
2076: }
2077: VecRestoreArrayRead(rr,&r);
2078: PetscLogFlops(a->nz);
2079: }
2080: return(0);
2081: }
2086: PetscErrorCode MatGetInfo_SeqBAIJ(Mat A,MatInfoType flag,MatInfo *info)
2087: {
2088: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2091: info->block_size = a->bs2;
2092: info->nz_allocated = a->bs2*a->maxnz;
2093: info->nz_used = a->bs2*a->nz;
2094: info->nz_unneeded = (double)(info->nz_allocated - info->nz_used);
2095: info->assemblies = A->num_ass;
2096: info->mallocs = A->info.mallocs;
2097: info->memory = ((PetscObject)A)->mem;
2098: if (A->factortype) {
2099: info->fill_ratio_given = A->info.fill_ratio_given;
2100: info->fill_ratio_needed = A->info.fill_ratio_needed;
2101: info->factor_mallocs = A->info.factor_mallocs;
2102: } else {
2103: info->fill_ratio_given = 0;
2104: info->fill_ratio_needed = 0;
2105: info->factor_mallocs = 0;
2106: }
2107: return(0);
2108: }
2112: PetscErrorCode MatZeroEntries_SeqBAIJ(Mat A)
2113: {
2114: Mat_SeqBAIJ *a = (Mat_SeqBAIJ*)A->data;
2118: PetscMemzero(a->a,a->bs2*a->i[a->mbs]*sizeof(MatScalar));
2119: return(0);
2120: }