Actual source code: bvlapack.c
slepc-3.19.0 2023-03-31
/*
   - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
   SLEPc - Scalable Library for Eigenvalue Problem Computations
   Copyright (c) 2002-, Universitat Politecnica de Valencia, Spain

   This file is part of SLEPc.
   SLEPc is distributed under a 2-clause BSD license (see LICENSE).
   - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
*/
/*
   BV private kernels that use LAPACK
*/

#include <slepc/private/bvimpl.h>
#include <slepcblaslapack.h>

/*
   Reduction operation to compute sqrt(x**2+y**2) when normalizing vectors
*/
SLEPC_EXTERN void MPIAPI SlepcPythag(void *in,void *inout,PetscMPIInt *len,MPI_Datatype *datatype)
{
  PetscBLASInt i,n=*len;
  PetscReal    *x = (PetscReal*)in,*y = (PetscReal*)inout;

  PetscFunctionBegin;
  if (PetscUnlikely(*datatype!=MPIU_REAL)) {
    (void)(*PetscErrorPrintf)("Only implemented for MPIU_REAL data type");
    MPI_Abort(PETSC_COMM_WORLD,1);
  }
  for (i=0;i<n;i++) y[i] = SlepcAbs(x[i],y[i]);
  PetscFunctionReturnVoid();
}
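
This reduction is meant to be registered as a user-defined MPI operation (presumably the MPIU_LAPY2 operation used in the Allreduce calls further below). A minimal standalone sketch of that pattern, with plain MPI types and an illustrative op name MY_LAPY2 rather than the library's actual setup code:

/* editorial sketch, not part of bvlapack.c; compile with an MPI C compiler and -lm */
#include <mpi.h>
#include <math.h>
#include <stdio.h>

static void Lapy2Op(void *in,void *inout,int *len,MPI_Datatype *dtype)
{
  double *x = (double*)in,*y = (double*)inout;
  (void)dtype;
  for (int i=0;i<*len;i++) y[i] = hypot(x[i],y[i]);  /* sqrt(x^2+y^2) without overflow */
}

int main(int argc,char **argv)
{
  MPI_Op MY_LAPY2;
  double lnrm = 3.0,gnrm;            /* pretend 3.0 is this process's local norm */

  MPI_Init(&argc,&argv);
  MPI_Op_create(Lapy2Op,1,&MY_LAPY2);  /* 1 = commutative */
  MPI_Allreduce(&lnrm,&gnrm,1,MPI_DOUBLE,MY_LAPY2,MPI_COMM_WORLD);
  printf("global norm %g\n",gnrm);
  MPI_Op_free(&MY_LAPY2);
  MPI_Finalize();
  return 0;
}
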
/*
   Compute the norm ||A|| of an mxn matrix. If mpi is true, A is the local part
   of a distributed matrix and the result is reduced across the BV communicator.
*/
PetscErrorCode BVNorm_LAPACK_Private(BV bv,PetscInt m_,PetscInt n_,const PetscScalar *A,NormType type,PetscReal *nrm,PetscBool mpi)
{
  PetscBLASInt m,n,i,j;
  PetscMPIInt  len;
  PetscReal    lnrm,*rwork=NULL,*rwork2=NULL;

  PetscFunctionBegin;
  PetscCall(PetscFPTrapPush(PETSC_FP_TRAP_OFF));
  PetscCall(PetscBLASIntCast(m_,&m));
  PetscCall(PetscBLASIntCast(n_,&n));
  if (type==NORM_FROBENIUS || type==NORM_2) {
    lnrm = LAPACKlange_("F",&m,&n,(PetscScalar*)A,&m,rwork);
    if (mpi) PetscCall(MPIU_Allreduce(&lnrm,nrm,1,MPIU_REAL,MPIU_LAPY2,PetscObjectComm((PetscObject)bv)));
    else *nrm = lnrm;
    PetscCall(PetscLogFlops(2.0*m*n));
  } else if (type==NORM_1) {
    if (mpi) {
      PetscCall(BVAllocateWork_Private(bv,2*n_));
      rwork = (PetscReal*)bv->work;
      rwork2 = rwork+n_;
      PetscCall(PetscArrayzero(rwork,n_));
      PetscCall(PetscArrayzero(rwork2,n_));
      for (j=0;j<n_;j++) {
        for (i=0;i<m_;i++) {
          rwork[j] += PetscAbsScalar(A[i+j*m_]);
        }
      }
      PetscCall(PetscMPIIntCast(n_,&len));
      PetscCall(MPIU_Allreduce(rwork,rwork2,len,MPIU_REAL,MPIU_SUM,PetscObjectComm((PetscObject)bv)));
      *nrm = 0.0;
      for (j=0;j<n_;j++) if (rwork2[j] > *nrm) *nrm = rwork2[j];
    } else {
      *nrm = LAPACKlange_("O",&m,&n,(PetscScalar*)A,&m,rwork);
    }
    PetscCall(PetscLogFlops(1.0*m*n));
  } else if (type==NORM_INFINITY) {
    PetscCall(BVAllocateWork_Private(bv,m_));
    rwork = (PetscReal*)bv->work;
    lnrm = LAPACKlange_("I",&m,&n,(PetscScalar*)A,&m,rwork);
    if (mpi) PetscCall(MPIU_Allreduce(&lnrm,nrm,1,MPIU_REAL,MPIU_MAX,PetscObjectComm((PetscObject)bv)));
    else *nrm = lnrm;
    PetscCall(PetscLogFlops(1.0*m*n));
  }
  PetscCall(PetscFPTrapPop());
  PetscFunctionReturn(PETSC_SUCCESS);
}
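
The LAPACKlange_ calls above follow the standard xLANGE interface, in which the workspace argument is referenced only for the infinity norm; that is why rwork can stay NULL for the Frobenius and one-norm cases but must hold m reals for NORM_INFINITY. A small standalone sketch, assuming a Fortran LAPACK with 32-bit integers and trailing underscores, linked with -llapack:

/* editorial sketch, not part of bvlapack.c */
#include <stdio.h>

extern double dlange_(const char *norm,const int *m,const int *n,const double *a,
                      const int *lda,double *work);

int main(void)
{
  int    m = 2,n = 2;
  double A[4] = {1.0,3.0,2.0,4.0};   /* column-major: [1 2; 3 4] */
  double work[2];                    /* referenced only for the infinity norm */

  printf("frobenius = %g\n",dlange_("F",&m,&n,A,&m,NULL));   /* sqrt(30) */
  printf("one-norm  = %g\n",dlange_("O",&m,&n,A,&m,NULL));   /* max column sum = 6 */
  printf("inf-norm  = %g\n",dlange_("I",&m,&n,A,&m,work));   /* max row sum = 7 */
  return 0;
}
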
/*
   Normalize the columns of an mxn matrix A. In real arithmetic, a nonzero eigi[j]
   marks a complex conjugate pair, whose two columns are normalized jointly.
*/
PetscErrorCode BVNormalize_LAPACK_Private(BV bv,PetscInt m_,PetscInt n_,const PetscScalar *A,PetscScalar *eigi,PetscBool mpi)
{
  PetscBLASInt m,j,k,info,zero=0;
  PetscMPIInt  len;
  PetscReal    *norms,*rwork=NULL,*rwork2=NULL,done=1.0;

  PetscFunctionBegin;
  PetscCall(PetscFPTrapPush(PETSC_FP_TRAP_OFF));
  PetscCall(PetscBLASIntCast(m_,&m));
  PetscCall(BVAllocateWork_Private(bv,2*n_));
  rwork = (PetscReal*)bv->work;
  rwork2 = rwork+n_;
  /* compute local norms */
  for (j=0;j<n_;j++) {
    k = 1;
#if !defined(PETSC_USE_COMPLEX)
    if (eigi && eigi[j] != 0.0) k = 2;
#endif
    rwork[j] = LAPACKlange_("F",&m,&k,(PetscScalar*)(A+j*m_),&m,rwork2);
    if (k==2) { rwork[j+1] = rwork[j]; j++; }
  }
  /* reduction to get global norms */
  if (mpi) {
    PetscCall(PetscMPIIntCast(n_,&len));
    PetscCall(PetscArrayzero(rwork2,n_));
    PetscCall(MPIU_Allreduce(rwork,rwork2,len,MPIU_REAL,MPIU_LAPY2,PetscObjectComm((PetscObject)bv)));
    norms = rwork2;
  } else norms = rwork;
  /* scale columns */
  for (j=0;j<n_;j++) {
    k = 1;
#if !defined(PETSC_USE_COMPLEX)
    if (eigi && eigi[j] != 0.0) k = 2;
#endif
    PetscCallBLAS("LAPACKlascl",LAPACKlascl_("G",&zero,&zero,norms+j,&done,&m,&k,(PetscScalar*)(A+j*m_),&m,&info));
    SlepcCheckLapackInfo("lascl",info);
    if (k==2) j++;
  }
  PetscCall(PetscLogFlops(3.0*m*n_));
  PetscCall(PetscFPTrapPop());
  PetscFunctionReturn(PETSC_SUCCESS);
}
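
The scaling step relies on xLASCL, which multiplies a matrix by cto/cfrom with over/underflow protection, so passing cfrom = ||column|| and cto = 1 divides each column by its norm. A minimal sketch under the same assumed Fortran-LAPACK conventions as before:

/* editorial sketch, not part of bvlapack.c */
#include <stdio.h>

extern void dlascl_(const char *type,const int *kl,const int *ku,const double *cfrom,
                    const double *cto,const int *m,const int *n,double *a,const int *lda,
                    int *info);

int main(void)
{
  int    m = 3,n = 1,zero = 0,info;
  double v[3] = {3.0,0.0,4.0};
  double nrm = 5.0,one = 1.0;       /* ||v|| = 5 */

  dlascl_("G",&zero,&zero,&nrm,&one,&m,&n,v,&m,&info);    /* v <- v*(1/5) */
  printf("info=%d  v = %g %g %g\n",info,v[0],v[1],v[2]);  /* 0.6 0 0.8 */
  return 0;
}
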
/*
   Compute the upper Cholesky factor in R and its inverse in S; only the active
   block (rows and columns l to k-1 of the BV) is referenced.
   If S == R then the inverse overwrites the Cholesky factor.
*/
PetscErrorCode BVMatCholInv_LAPACK_Private(BV bv,Mat R,Mat S)
{
  PetscInt     i,k,l,n,m,ld,lds;
  PetscScalar  *pR,*pS;
  PetscBLASInt info,n_ = 0,m_ = 0,ld_,lds_;

  PetscFunctionBegin;
  l = bv->l;
  k = bv->k;
  PetscCall(MatGetSize(R,&m,NULL));
  n = k-l;
  PetscCall(PetscBLASIntCast(m,&m_));
  PetscCall(PetscBLASIntCast(n,&n_));
  ld  = m;
  ld_ = m_;
  PetscCall(MatDenseGetArray(R,&pR));

  if (S==R) {
    PetscCall(BVAllocateWork_Private(bv,m*k));
    pS = bv->work;
    lds  = ld;
    lds_ = ld_;
  } else {
    PetscCall(MatDenseGetArray(S,&pS));
    PetscCall(MatGetSize(S,&lds,NULL));
    PetscCall(PetscBLASIntCast(lds,&lds_));
  }

  /* save a copy of matrix in S */
  for (i=l;i<k;i++) PetscCall(PetscArraycpy(pS+i*lds+l,pR+i*ld+l,n));

  /* compute upper Cholesky factor in R */
  PetscCall(PetscFPTrapPush(PETSC_FP_TRAP_OFF));
  PetscCallBLAS("LAPACKpotrf",LAPACKpotrf_("U",&n_,pR+l*ld+l,&ld_,&info));
  PetscCall(PetscLogFlops((1.0*n*n*n)/3.0));

  if (info) {  /* LAPACKpotrf failed, retry on diagonally perturbed matrix */
    for (i=l;i<k;i++) {
      PetscCall(PetscArraycpy(pR+i*ld+l,pS+i*lds+l,n));
      pR[i+i*ld] += 50.0*PETSC_MACHINE_EPSILON;
    }
    PetscCallBLAS("LAPACKpotrf",LAPACKpotrf_("U",&n_,pR+l*ld+l,&ld_,&info));
    SlepcCheckLapackInfo("potrf",info);
    PetscCall(PetscLogFlops((1.0*n*n*n)/3.0));
  }

  /* compute S = inv(R) */
  if (S==R) {
    PetscCallBLAS("LAPACKtrtri",LAPACKtrtri_("U","N",&n_,pR+l*ld+l,&ld_,&info));
  } else {
    PetscCall(PetscArrayzero(pS+l*lds,(k-l)*k));
    for (i=l;i<k;i++) PetscCall(PetscArraycpy(pS+i*lds+l,pR+i*ld+l,n));
    PetscCallBLAS("LAPACKtrtri",LAPACKtrtri_("U","N",&n_,pS+l*lds+l,&lds_,&info));
  }
  SlepcCheckLapackInfo("trtri",info);
  PetscCall(PetscFPTrapPop());
  PetscCall(PetscLogFlops(0.33*n*n*n));

  /* Zero out entries below the diagonal */
  for (i=l;i<k-1;i++) {
    PetscCall(PetscArrayzero(pR+i*ld+i+1,(k-i-1)));
    if (S!=R) PetscCall(PetscArrayzero(pS+i*lds+i+1,(k-i-1)));
  }
  PetscCall(MatDenseRestoreArray(R,&pR));
  if (S!=R) PetscCall(MatDenseRestoreArray(S,&pS));
  PetscFunctionReturn(PETSC_SUCCESS);
}
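
These kernels are building blocks for Cholesky-QR style orthogonalization: with G = V'*V = R'*R, the basis V*inv(R) has orthonormal columns. A compact serial sketch of that idea (illustrative only; assumes Fortran BLAS/LAPACK with 32-bit integers and trailing underscores):

/* editorial sketch, not part of bvlapack.c; link with -lblas -llapack */
#include <stdio.h>

extern void dgemm_(const char*,const char*,const int*,const int*,const int*,const double*,
                   const double*,const int*,const double*,const int*,const double*,double*,const int*);
extern void dpotrf_(const char *uplo,const int *n,double *a,const int *lda,int *info);
extern void dtrtri_(const char *uplo,const char *diag,const int *n,double *a,const int *lda,int *info);
extern void dtrmm_(const char *side,const char *uplo,const char *transa,const char *diag,
                   const int *m,const int *n,const double *alpha,const double *a,const int *lda,
                   double *b,const int *ldb);

int main(void)
{
  int    m = 3,n = 2,info;
  double one = 1.0,zero = 0.0;
  double V[6] = {1,1,0, 0,1,1};   /* column-major 3x2 basis */
  double G[4];

  dgemm_("T","N",&n,&n,&m,&one,V,&m,V,&m,&zero,G,&n);  /* G = V'*V */
  dpotrf_("U",&n,G,&n,&info);                           /* G <- upper Cholesky factor R */
  dtrtri_("U","N",&n,G,&n,&info);                       /* G <- inv(R) */
  dtrmm_("R","U","N","N",&m,&n,&one,G,&n,V,&m);         /* V <- V*inv(R): orthonormal columns */
  printf("info=%d  Q(0,0)=%g (expected 1/sqrt(2))\n",info,V[0]);
  return 0;
}
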
/*
   Compute the inverse of an upper triangular matrix R, store it in S.
   If S == R then the inverse overwrites R.
*/
PetscErrorCode BVMatTriInv_LAPACK_Private(BV bv,Mat R,Mat S)
{
  PetscInt     i,k,l,n,m,ld,lds;
  PetscScalar  *pR,*pS;
  PetscBLASInt info,n_,m_ = 0,ld_,lds_;

  PetscFunctionBegin;
  l = bv->l;
  k = bv->k;
  PetscCall(MatGetSize(R,&m,NULL));
  n = k-l;
  PetscCall(PetscBLASIntCast(m,&m_));
  PetscCall(PetscBLASIntCast(n,&n_));
  ld  = m;
  ld_ = m_;
  PetscCall(MatDenseGetArray(R,&pR));

  if (S==R) {
    PetscCall(BVAllocateWork_Private(bv,m*k));
    pS = bv->work;
    lds  = ld;
    lds_ = ld_;
  } else {
    PetscCall(MatDenseGetArray(S,&pS));
    PetscCall(MatGetSize(S,&lds,NULL));
    PetscCall(PetscBLASIntCast(lds,&lds_));
  }

  /* compute S = inv(R) */
  PetscCall(PetscFPTrapPush(PETSC_FP_TRAP_OFF));
  if (S==R) {
    PetscCallBLAS("LAPACKtrtri",LAPACKtrtri_("U","N",&n_,pR+l*ld+l,&ld_,&info));
  } else {
    PetscCall(PetscArrayzero(pS+l*lds,(k-l)*k));
    for (i=l;i<k;i++) PetscCall(PetscArraycpy(pS+i*lds+l,pR+i*ld+l,n));
    PetscCallBLAS("LAPACKtrtri",LAPACKtrtri_("U","N",&n_,pS+l*lds+l,&lds_,&info));
  }
  SlepcCheckLapackInfo("trtri",info);
  PetscCall(PetscFPTrapPop());
  PetscCall(PetscLogFlops(0.33*n*n*n));

  PetscCall(MatDenseRestoreArray(R,&pR));
  if (S!=R) PetscCall(MatDenseRestoreArray(S,&pS));
  PetscFunctionReturn(PETSC_SUCCESS);
}
/*
   Compute the matrix to be used for post-multiplying the basis in the SVQB
   block orthogonalization method.
   On input R = V'*V, on output S = D*U*Lambda^{-1/2} where (U,Lambda) is
   the eigendecomposition of D*R*D with D=diag(R)^{-1/2}.
   If S == R then the result overwrites R.
*/
PetscErrorCode BVMatSVQB_LAPACK_Private(BV bv,Mat R,Mat S)
{
  PetscInt     i,j,k,l,n,m,ld,lds;
  PetscScalar  *pR,*pS,*D,*work,a;
  PetscReal    *eig,dummy;
  PetscBLASInt info,lwork,n_,m_ = 0,ld_,lds_;
#if defined(PETSC_USE_COMPLEX)
  PetscReal    *rwork,rdummy;
#endif

  PetscFunctionBegin;
  l = bv->l;
  k = bv->k;
  PetscCall(MatGetSize(R,&m,NULL));
  PetscCall(MatDenseGetLDA(R,&ld));
  n = k-l;
  PetscCall(PetscBLASIntCast(m,&m_));
  PetscCall(PetscBLASIntCast(n,&n_));
  ld_ = m_;
  PetscCall(MatDenseGetArray(R,&pR));

  if (S==R) {
    pS = pR;
    lds  = ld;
    lds_ = ld_;
  } else {
    PetscCall(MatDenseGetArray(S,&pS));
    PetscCall(MatDenseGetLDA(S,&lds));
    PetscCall(PetscBLASIntCast(lds,&lds_));
  }
  PetscCall(PetscFPTrapPush(PETSC_FP_TRAP_OFF));

  /* workspace query and memory allocation */
  lwork = -1;
#if defined(PETSC_USE_COMPLEX)
  PetscCallBLAS("LAPACKsyev",LAPACKsyev_("V","L",&n_,pS,&lds_,&dummy,&a,&lwork,&rdummy,&info));
  PetscCall(PetscBLASIntCast((PetscInt)PetscRealPart(a),&lwork));
  PetscCall(PetscMalloc4(n,&eig,n,&D,lwork,&work,PetscMax(1,3*n-2),&rwork));
#else
  PetscCallBLAS("LAPACKsyev",LAPACKsyev_("V","L",&n_,pS,&lds_,&dummy,&a,&lwork,&info));
  PetscCall(PetscBLASIntCast((PetscInt)a,&lwork));
  PetscCall(PetscMalloc3(n,&eig,n,&D,lwork,&work));
#endif

  /* copy and scale matrix */
  for (i=l;i<k;i++) D[i-l] = 1.0/PetscSqrtReal(PetscRealPart(pR[i+i*ld]));
  for (i=l;i<k;i++) for (j=l;j<k;j++) pS[i+j*lds] = pR[i+j*ld]*D[i-l];
  for (j=l;j<k;j++) for (i=l;i<k;i++) pS[i+j*lds] *= D[j-l];

  /* compute eigendecomposition */
#if defined(PETSC_USE_COMPLEX)
  PetscCallBLAS("LAPACKsyev",LAPACKsyev_("V","L",&n_,pS+l*lds+l,&lds_,eig,work,&lwork,rwork,&info));
#else
  PetscCallBLAS("LAPACKsyev",LAPACKsyev_("V","L",&n_,pS+l*lds+l,&lds_,eig,work,&lwork,&info));
#endif
  SlepcCheckLapackInfo("syev",info);

  if (S!=R) {  /* R = U' */
    for (i=l;i<k;i++) for (j=l;j<k;j++) pR[i+j*ld] = pS[j+i*lds];
  }

  /* compute S = D*U*Lambda^{-1/2} */
  for (i=l;i<k;i++) for (j=l;j<k;j++) pS[i+j*lds] *= D[i-l];
  for (j=l;j<k;j++) for (i=l;i<k;i++) pS[i+j*lds] /= PetscSqrtReal(eig[j-l]);

  if (S!=R) {  /* compute R = inv(S) = Lambda^{1/2}*U'/D */
    for (i=l;i<k;i++) for (j=l;j<k;j++) pR[i+j*ld] *= PetscSqrtReal(eig[i-l]);
    for (j=l;j<k;j++) for (i=l;i<k;i++) pR[i+j*ld] /= D[j-l];
  }

#if defined(PETSC_USE_COMPLEX)
  PetscCall(PetscFree4(eig,D,work,rwork));
#else
  PetscCall(PetscFree3(eig,D,work));
#endif
  PetscCall(PetscLogFlops(9.0*n*n*n));
  PetscCall(PetscFPTrapPop());

  PetscCall(MatDenseRestoreArray(R,&pR));
  if (S!=R) PetscCall(MatDenseRestoreArray(S,&pS));
  PetscFunctionReturn(PETSC_SUCCESS);
}
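
A brief editorial note on why this transformation orthonormalizes the basis:

/*
   With D = diag(R)^{-1/2} and the symmetric eigendecomposition D*R*D = U*Lambda*U',
   the returned S = D*U*Lambda^{-1/2} satisfies
       S'*R*S = Lambda^{-1/2}*U'*(D*R*D)*U*Lambda^{-1/2}
              = Lambda^{-1/2}*Lambda*Lambda^{-1/2} = I,
   so when R = V'*V the post-multiplied basis V*S has orthonormal columns.
*/
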
/*
   QR factorization of an mxn matrix via parallel TSQR
*/
PetscErrorCode BVOrthogonalize_LAPACK_TSQR(BV bv,PetscInt m_,PetscInt n_,PetscScalar *Q,PetscScalar *R,PetscInt ldr)
{
  PetscInt     level,plevel,nlevels,powtwo,lda,worklen;
  PetscBLASInt m,n,i,j,k,l,s = 0,nb,sz,lwork,info;
  PetscScalar  *tau,*work,*A=NULL,*QQ=NULL,*Qhalf,*C=NULL,one=1.0,zero=0.0;
  PetscMPIInt  rank,size,count,stride;
  MPI_Datatype tmat;

  PetscFunctionBegin;
  PetscCall(PetscFPTrapPush(PETSC_FP_TRAP_OFF));
  PetscCall(PetscBLASIntCast(m_,&m));
  PetscCall(PetscBLASIntCast(n_,&n));
  k = PetscMin(m,n);
  nb = 16;
  lda = 2*n;
  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)bv),&size));
  PetscCallMPI(MPI_Comm_rank(PetscObjectComm((PetscObject)bv),&rank));
  nlevels = (PetscInt)PetscCeilReal(PetscLog2Real((PetscReal)size));
  powtwo = PetscPowInt(2,(PetscInt)PetscFloorReal(PetscLog2Real((PetscReal)size)));
  worklen = n+n*nb;
  if (nlevels) worklen += n*lda+n*lda*nlevels+n*lda;
  PetscCall(BVAllocateWork_Private(bv,worklen));
  tau  = bv->work;
  work = bv->work+n;
  PetscCall(PetscBLASIntCast(n*nb,&lwork));
  if (nlevels) {
    A  = bv->work+n+n*nb;
    QQ = bv->work+n+n*nb+n*lda;
    C  = bv->work+n+n*nb+n*lda+n*lda*nlevels;
  }

  /* Compute QR */
  PetscCallBLAS("LAPACKgeqrf",LAPACKgeqrf_(&m,&n,Q,&m,tau,work,&lwork,&info));
  SlepcCheckLapackInfo("geqrf",info);

  /* Extract R */
  if (R || nlevels) {
    for (j=0;j<n;j++) {
      for (i=0;i<=PetscMin(j,m-1);i++) {
        if (nlevels) A[i+j*lda] = Q[i+j*m];
        else R[i+j*ldr] = Q[i+j*m];
      }
      for (i=PetscMin(j,m-1)+1;i<n;i++) {
        if (nlevels) A[i+j*lda] = 0.0;
        else R[i+j*ldr] = 0.0;
      }
    }
  }

  /* Compute orthogonal matrix in Q */
  PetscCallBLAS("LAPACKorgqr",LAPACKorgqr_(&m,&k,&k,Q,&m,tau,work,&lwork,&info));
  SlepcCheckLapackInfo("orgqr",info);

  if (nlevels) {

    PetscCall(PetscMPIIntCast(n,&count));
    PetscCall(PetscMPIIntCast(lda,&stride));
    PetscCall(PetscBLASIntCast(lda,&l));
    PetscCallMPI(MPI_Type_vector(count,count,stride,MPIU_SCALAR,&tmat));
    PetscCallMPI(MPI_Type_commit(&tmat));

    for (level=nlevels;level>=1;level--) {

      plevel = PetscPowInt(2,level);
      PetscCall(PetscBLASIntCast(plevel*PetscFloorReal(rank/(PetscReal)plevel)+(rank+PetscPowInt(2,level-1))%plevel,&s));

      /* Stack triangular matrices */
      if (rank<s && s<size) {  /* send top part, receive bottom part */
        PetscCallMPI(MPI_Sendrecv(A,1,tmat,s,111,A+n,1,tmat,s,111,PetscObjectComm((PetscObject)bv),MPI_STATUS_IGNORE));
      } else if (s<size) {  /* copy top to bottom, receive top part */
        PetscCallMPI(MPI_Sendrecv(A,1,tmat,rank,111,A+n,1,tmat,rank,111,PetscObjectComm((PetscObject)bv),MPI_STATUS_IGNORE));
        PetscCallMPI(MPI_Sendrecv(A+n,1,tmat,s,111,A,1,tmat,s,111,PetscObjectComm((PetscObject)bv),MPI_STATUS_IGNORE));
      }
      if (level<nlevels && size!=powtwo) {  /* for cases when size is not a power of 2 */
        if (rank<size-powtwo) {  /* send bottom part */
          PetscCallMPI(MPI_Send(A+n,1,tmat,rank+powtwo,111,PetscObjectComm((PetscObject)bv)));
        } else if (rank>=powtwo) {  /* receive bottom part */
          PetscCallMPI(MPI_Recv(A+n,1,tmat,rank-powtwo,111,PetscObjectComm((PetscObject)bv),MPI_STATUS_IGNORE));
        }
      }
      /* Compute QR and build orthogonal matrix */
      if (level<nlevels || (level==nlevels && s<size)) {
        PetscCallBLAS("LAPACKgeqrf",LAPACKgeqrf_(&l,&n,A,&l,tau,work,&lwork,&info));
        SlepcCheckLapackInfo("geqrf",info);
        PetscCall(PetscArraycpy(QQ+(level-1)*n*lda,A,n*lda));
        PetscCallBLAS("LAPACKorgqr",LAPACKorgqr_(&l,&n,&n,QQ+(level-1)*n*lda,&l,tau,work,&lwork,&info));
        SlepcCheckLapackInfo("orgqr",info);
        for (j=0;j<n;j++) {
          for (i=j+1;i<n;i++) A[i+j*lda] = 0.0;
        }
      } else if (level==nlevels) {  /* only one triangular matrix, set Q=I */
        PetscCall(PetscArrayzero(QQ+(level-1)*n*lda,n*lda));
        for (j=0;j<n;j++) QQ[j+j*lda+(level-1)*n*lda] = 1.0;
      }
    }

    /* Extract R */
    if (R) {
      for (j=0;j<n;j++) {
        for (i=0;i<=j;i++) R[i+j*ldr] = A[i+j*lda];
        for (i=j+1;i<n;i++) R[i+j*ldr] = 0.0;
      }
    }

    /* Accumulate orthogonal matrices */
    for (level=1;level<=nlevels;level++) {
      plevel = PetscPowInt(2,level);
      PetscCall(PetscBLASIntCast(plevel*PetscFloorReal(rank/(PetscReal)plevel)+(rank+PetscPowInt(2,level-1))%plevel,&s));
      Qhalf = (rank<s)? QQ+(level-1)*n*lda: QQ+(level-1)*n*lda+n;
      if (level<nlevels) {
        PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&l,&n,&n,&one,QQ+level*n*lda,&l,Qhalf,&l,&zero,C,&l));
        PetscCall(PetscArraycpy(QQ+level*n*lda,C,n*lda));
      } else {
        for (i=0;i<m/l;i++) {
          PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&l,&n,&n,&one,Q+i*l,&m,Qhalf,&l,&zero,C,&l));
          for (j=0;j<n;j++) PetscCall(PetscArraycpy(Q+i*l+j*m,C+j*l,l));
        }
        sz = m%l;
        if (sz) {
          PetscCallBLAS("BLASgemm",BLASgemm_("N","N",&sz,&n,&n,&one,Q+(m/l)*l,&m,Qhalf,&l,&zero,C,&l));
          for (j=0;j<n;j++) PetscCall(PetscArraycpy(Q+(m/l)*l+j*m,C+j*l,sz));
        }
      }
    }

    PetscCallMPI(MPI_Type_free(&tmat));
  }

  PetscCall(PetscLogFlops(3.0*m*n*n));
  PetscCall(PetscFPTrapPop());
  PetscFunctionReturn(PETSC_SUCCESS);
}
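
The core of TSQR is that stacking two n x n triangular factors and refactoring them yields the R factor of the combined data, which is what the geqrf call on the 2n x n workspace A (lda = 2*n) does at each tree level. A serial sketch of that combine step, under the same assumed Fortran-LAPACK conventions:

/* editorial sketch, not part of bvlapack.c */
#include <stdio.h>

extern void dgeqrf_(const int *m,const int *n,double *a,const int *lda,double *tau,
                    double *work,const int *lwork,int *info);

int main(void)
{
  int    n = 2,m = 4,lwork = 64,info;
  double tau[2],work[64];
  /* column-major 4x2: rows 0-1 hold R1 = [1 2; 0 3], rows 2-3 hold R2 = [4 5; 0 6] */
  double A[8] = {1,0,4,0, 2,3,5,6};

  dgeqrf_(&m,&n,A,&m,tau,work,&lwork,&info);
  /* the upper triangle of A now holds the combined R (up to column signs) */
  printf("info=%d  R = [%g %g; 0 %g]\n",info,A[0],A[4],A[5]);
  return 0;
}
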
/*
   Reduction operation to compute [~,Rout]=qr([Rin1;Rin2]) in the TSQR algorithm;
   all matrices are upper triangular stored in packed format
*/
SLEPC_EXTERN void MPIAPI SlepcGivensPacked(void *in,void *inout,PetscMPIInt *len,MPI_Datatype *datatype)
{
  PetscBLASInt n,i,j,k,one=1;
  PetscMPIInt  tsize;
  PetscScalar  v,s,*R2=(PetscScalar*)in,*R1=(PetscScalar*)inout;
  PetscReal    c;

  PetscFunctionBegin;
  PetscCallMPIAbort(PETSC_COMM_SELF,MPI_Type_size(*datatype,&tsize));  /* we assume len=1 */
  tsize /= sizeof(PetscScalar);
  n = (-1+(PetscBLASInt)PetscSqrtReal(1+8*tsize))/2;
  for (j=0;j<n;j++) {
    for (i=0;i<=j;i++) {
      LAPACKlartg_(R1+(2*n-j-1)*j/2+j,R2+(2*n-i-1)*i/2+j,&c,&s,&v);
      R1[(2*n-j-1)*j/2+j] = v;
      k = n-j-1;
      if (k) BLASrot_(&k,R1+(2*n-j-1)*j/2+j+1,&one,R2+(2*n-i-1)*i/2+j+1,&one,&c,&s);
    }
  }
  PetscFunctionReturnVoid();
}
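
The index arithmetic above assumes the upper triangle is packed row by row, so entry (i,j), with i <= j, of an n x n triangular matrix lives at offset (2n-i-1)*i/2 + j. A tiny self-contained check of that formula (the helper packed_index is illustrative, not part of the library):

/* editorial sketch, not part of bvlapack.c */
#include <stdio.h>

static int packed_index(int n,int i,int j) { return (2*n-i-1)*i/2 + j; }

int main(void)
{
  int    n = 3;
  /* upper triangle of [1 2 3; 0 4 5; 0 0 6] packed row by row: 1 2 3 4 5 6 */
  double R[6] = {1,2,3,4,5,6};

  printf("R(0,2)=%g  R(1,1)=%g  R(2,2)=%g\n",
         R[packed_index(n,0,2)],R[packed_index(n,1,1)],R[packed_index(n,2,2)]);  /* 3 4 6 */
  return 0;
}
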
/*
   Computes the R factor of the QR factorization of an mxn matrix via parallel TSQR
*/
PetscErrorCode BVOrthogonalize_LAPACK_TSQR_OnlyR(BV bv,PetscInt m_,PetscInt n_,PetscScalar *Q,PetscScalar *R,PetscInt ldr)
{
  PetscInt     worklen;
  PetscBLASInt m,n,i,j,s,nb,lwork,info;
  PetscScalar  *tau,*work,*A=NULL,*R1=NULL,*R2=NULL;
  PetscMPIInt  size,count;
  MPI_Datatype tmat;

  PetscFunctionBegin;
  PetscCall(PetscFPTrapPush(PETSC_FP_TRAP_OFF));
  PetscCall(PetscBLASIntCast(m_,&m));
  PetscCall(PetscBLASIntCast(n_,&n));
  nb = 16;
  s = n+n*(n-1)/2;  /* length of packed triangular matrix */
  PetscCallMPI(MPI_Comm_size(PetscObjectComm((PetscObject)bv),&size));
  worklen = n+n*nb+2*s+m*n;
  PetscCall(BVAllocateWork_Private(bv,worklen));
  tau  = bv->work;
  work = bv->work+n;
  R1   = bv->work+n+n*nb;
  R2   = bv->work+n+n*nb+s;
  A    = bv->work+n+n*nb+2*s;
  PetscCall(PetscBLASIntCast(n*nb,&lwork));
  PetscCall(PetscArraycpy(A,Q,m*n));

  /* Compute QR */
  PetscCallBLAS("LAPACKgeqrf",LAPACKgeqrf_(&m,&n,A,&m,tau,work,&lwork,&info));
  SlepcCheckLapackInfo("geqrf",info);

  if (size==1) {
    /* Extract R */
    for (j=0;j<n;j++) {
      for (i=0;i<=PetscMin(j,m-1);i++) R[i+j*ldr] = A[i+j*m];
      for (i=PetscMin(j,m-1)+1;i<n;i++) R[i+j*ldr] = 0.0;
    }
  } else {
    /* Use MPI reduction operation to obtain global R */
    PetscCall(PetscMPIIntCast(s,&count));
    PetscCallMPI(MPI_Type_contiguous(count,MPIU_SCALAR,&tmat));
    PetscCallMPI(MPI_Type_commit(&tmat));
    for (i=0;i<n;i++) {
      for (j=i;j<n;j++) R1[(2*n-i-1)*i/2+j] = (i<m)?A[i+j*m]:0.0;
    }
    PetscCall(MPIU_Allreduce(R1,R2,1,tmat,MPIU_TSQR,PetscObjectComm((PetscObject)bv)));
    for (i=0;i<n;i++) {
      for (j=0;j<i;j++) R[i+j*ldr] = 0.0;
      for (j=i;j<n;j++) R[i+j*ldr] = R2[(2*n-i-1)*i/2+j];
    }
    PetscCallMPI(MPI_Type_free(&tmat));
  }

  PetscCall(PetscLogFlops(3.0*m*n*n));
  PetscCall(PetscFPTrapPop());
  PetscFunctionReturn(PETSC_SUCCESS);
}