Actual source code: sveccuda.cu

slepc-3.19.0 2023-03-31
Report Typos and Errors
  1: /*
  2:    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  3:    SLEPc - Scalable Library for Eigenvalue Problem Computations
  4:    Copyright (c) 2002-, Universitat Politecnica de Valencia, Spain

  6:    This file is part of SLEPc.
  7:    SLEPc is distributed under a 2-clause BSD license (see LICENSE).
  8:    - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
  9: */
 10: /*
 11:    BV implemented as a single Vec (CUDA version)
 12: */

 14: #include <slepc/private/bvimpl.h>
 15: #include "../src/sys/classes/bv/impls/svec/svec.h"
 16: #include <slepccublas.h>

 18: #if defined(PETSC_USE_COMPLEX)
 19: #include <thrust/device_ptr.h>
 20: #endif

 22: #define BLOCKSIZE 64

 24: /*
 25:     B := alpha*A + beta*B

 27:     A,B are nxk (ld=n)
 28:  */
 29: static PetscErrorCode BVAXPY_BLAS_CUDA(BV bv,PetscInt n_,PetscInt k_,PetscScalar alpha,const PetscScalar *d_A,PetscScalar beta,PetscScalar *d_B)
 30: {
 31:   PetscCuBLASInt m=0,one=1;
 32:   cublasHandle_t cublasv2handle;

 34:   PetscFunctionBegin;
 35:   (void)bv; // avoid unused parameter warning
 36:   PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
 37:   PetscCall(PetscCuBLASIntCast(n_*k_,&m));
 38:   PetscCall(PetscLogGpuTimeBegin());
 39:   if (beta!=(PetscScalar)1.0) {
 40:     PetscCallCUBLAS(cublasXscal(cublasv2handle,m,&beta,d_B,one));
 41:     PetscCall(PetscLogGpuFlops(1.0*m));
 42:   }
 43:   PetscCallCUBLAS(cublasXaxpy(cublasv2handle,m,&alpha,d_A,one,d_B,one));
 44:   PetscCall(PetscLogGpuTimeEnd());
 45:   PetscCall(PetscLogGpuFlops(2.0*m));
 46:   PetscFunctionReturn(PETSC_SUCCESS);
 47: }

 49: /*
 50:     C := alpha*A*Q + beta*C
 51: */
 52: PetscErrorCode BVMult_Svec_CUDA(BV Y,PetscScalar alpha,PetscScalar beta,BV X,Mat Q)
 53: {
 54:   BV_SVEC           *y = (BV_SVEC*)Y->data,*x = (BV_SVEC*)X->data;
 55:   const PetscScalar *d_px,*d_A,*q;
 56:   PetscScalar       *d_py,*d_q,*d_B,*d_C;
 57:   PetscInt          ldq,mq;
 58:   PetscCuBLASInt    m=0,n=0,k=0,ldq_=0;
 59:   cublasHandle_t    cublasv2handle;
 60:   PetscBool         matiscuda;

 62:   PetscFunctionBegin;
 63:   if (!Y->n) PetscFunctionReturn(PETSC_SUCCESS);
 64:   PetscCall(VecCUDAGetArrayRead(x->v,&d_px));
 65:   if (beta==(PetscScalar)0.0) PetscCall(VecCUDAGetArrayWrite(y->v,&d_py));
 66:   else PetscCall(VecCUDAGetArray(y->v,&d_py));
 67:   d_A = d_px+(X->nc+X->l)*X->n;
 68:   d_C = d_py+(Y->nc+Y->l)*Y->n;
 69:   if (Q) {
 70:     PetscCall(PetscCuBLASIntCast(Y->n,&m));
 71:     PetscCall(PetscCuBLASIntCast(Y->k-Y->l,&n));
 72:     PetscCall(PetscCuBLASIntCast(X->k-X->l,&k));
 73:     PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
 74:     PetscCall(MatGetSize(Q,NULL,&mq));
 75:     PetscCall(MatDenseGetLDA(Q,&ldq));
 76:     PetscCall(PetscCuBLASIntCast(ldq,&ldq_));
 77:     PetscCall(PetscObjectTypeCompare((PetscObject)Q,MATSEQDENSECUDA,&matiscuda));
 78:     if (matiscuda) PetscCall(MatDenseCUDAGetArrayRead(Q,(const PetscScalar**)&d_q));
 79:     else {
 80:       PetscCall(MatDenseGetArrayRead(Q,&q));
 81:       PetscCallCUDA(cudaMalloc((void**)&d_q,ldq*mq*sizeof(PetscScalar)));
 82:       PetscCallCUDA(cudaMemcpy(d_q,q,ldq*mq*sizeof(PetscScalar),cudaMemcpyHostToDevice));
 83:       PetscCall(PetscLogCpuToGpu(ldq*mq*sizeof(PetscScalar)));
 84:     }
 85:     d_B = d_q+Y->l*ldq+X->l;
 86:     PetscCall(PetscLogGpuTimeBegin());
 87:     PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,m,n,k,&alpha,d_A,m,d_B,ldq_,&beta,d_C,m));
 88:     PetscCall(PetscLogGpuTimeEnd());
 89:     if (matiscuda) PetscCall(MatDenseCUDARestoreArrayRead(Q,(const PetscScalar**)&d_q));
 90:     else {
 91:       PetscCall(MatDenseRestoreArrayRead(Q,&q));
 92:       PetscCallCUDA(cudaFree(d_q));
 93:     }
 94:     PetscCall(PetscLogGpuFlops(2.0*m*n*k));
 95:   } else PetscCall(BVAXPY_BLAS_CUDA(Y,Y->n,Y->k-Y->l,alpha,d_A,beta,d_C));
 96:   PetscCall(VecCUDARestoreArrayRead(x->v,&d_px));
 97:   PetscCall(VecCUDARestoreArrayWrite(y->v,&d_py));
 98:   PetscFunctionReturn(PETSC_SUCCESS);
 99: }

/*
    y := alpha*A*x + beta*y

    A = active columns of X (n x k). The coefficient vector x is taken from
    the host array q, or from X->buffer when q is NULL.
*/
PetscErrorCode BVMultVec_Svec_CUDA(BV X,PetscScalar alpha,PetscScalar beta,Vec y,PetscScalar *q)
{
  BV_SVEC           *x = (BV_SVEC*)X->data;
  const PetscScalar *d_px,*d_A;
  PetscScalar       *d_py,*d_q,*d_x,*d_y;
  PetscCuBLASInt    n=0,k=0,one=1;
  cublasHandle_t    cublasv2handle;

  PetscFunctionBegin;
  PetscCall(PetscCuBLASIntCast(X->n,&n));        /* local rows */
  PetscCall(PetscCuBLASIntCast(X->k-X->l,&k));   /* number of active columns */
  PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
  PetscCall(VecCUDAGetArrayRead(x->v,&d_px));
  /* beta==0 overwrites y, so a write-only access avoids copying stale data to the GPU */
  if (beta==(PetscScalar)0.0) PetscCall(VecCUDAGetArrayWrite(y,&d_py));
  else PetscCall(VecCUDAGetArray(y,&d_py));
  if (!q) PetscCall(VecCUDAGetArray(X->buffer,&d_q));
  else {
    /* q is a host array: stage it in a temporary device buffer */
    PetscCallCUDA(cudaMalloc((void**)&d_q,k*sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy(d_q,q,k*sizeof(PetscScalar),cudaMemcpyHostToDevice));
    PetscCall(PetscLogCpuToGpu(k*sizeof(PetscScalar)));
  }
  d_A = d_px+(X->nc+X->l)*X->n;   /* first active column of X */
  d_x = d_q;
  d_y = d_py;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCallCUBLAS(cublasXgemv(cublasv2handle,CUBLAS_OP_N,n,k,&alpha,d_A,n,d_x,one,&beta,d_y,one));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(VecCUDARestoreArrayRead(x->v,&d_px));
  /* restore mirrors the get above */
  if (beta==(PetscScalar)0.0) PetscCall(VecCUDARestoreArrayWrite(y,&d_py));
  else PetscCall(VecCUDARestoreArray(y,&d_py));
  if (!q) PetscCall(VecCUDARestoreArray(X->buffer,&d_q));
  else PetscCallCUDA(cudaFree(d_q));
  PetscCall(PetscLogGpuFlops(2.0*n*k));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
    A(:,s:e-1) := A*B(:,s:e-1)

    Since the result overwrites part of the input, the product is computed
    into a temporary device buffer and copied back with cudaMemcpy2D. If the
    whole m x n result does not fit in free device memory, it is computed in
    row blocks of size bs.
*/
PetscErrorCode BVMultInPlace_Svec_CUDA(BV V,Mat Q,PetscInt s,PetscInt e)
{
  BV_SVEC           *ctx = (BV_SVEC*)V->data;
  PetscScalar       *d_pv,*d_q,*d_A,*d_B,*d_work,sone=1.0,szero=0.0;
  const PetscScalar *q;
  PetscInt          ldq,nq;
  PetscCuBLASInt    m=0,n=0,k=0,l=0,ldq_=0,bs=BLOCKSIZE;
  size_t            freemem,totmem;
  cublasHandle_t    cublasv2handle;
  PetscBool         matiscuda;

  PetscFunctionBegin;
  if (!V->n) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCall(PetscCuBLASIntCast(V->n,&m));        /* local rows */
  PetscCall(PetscCuBLASIntCast(e-s,&n));         /* columns to be updated */
  PetscCall(PetscCuBLASIntCast(V->k-V->l,&k));   /* active columns of V */
  PetscCall(MatGetSize(Q,NULL,&nq));
  PetscCall(MatDenseGetLDA(Q,&ldq));
  PetscCall(PetscCuBLASIntCast(ldq,&ldq_));
  PetscCall(VecCUDAGetArray(ctx->v,&d_pv));
  PetscCall(PetscObjectTypeCompare((PetscObject)Q,MATSEQDENSECUDA,&matiscuda));
  if (matiscuda) PetscCall(MatDenseCUDAGetArrayRead(Q,(const PetscScalar**)&d_q));
  else {
    /* Q lives on the host: stage it in a temporary device buffer */
    PetscCall(MatDenseGetArrayRead(Q,&q));
    PetscCallCUDA(cudaMalloc((void**)&d_q,ldq*nq*sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy(d_q,q,ldq*nq*sizeof(PetscScalar),cudaMemcpyHostToDevice));
    PetscCall(PetscLogCpuToGpu(ldq*nq*sizeof(PetscScalar)));
  }
  PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
  PetscCall(PetscLogGpuTimeBegin());
  /* try to allocate the whole matrix */
  PetscCallCUDA(cudaMemGetInfo(&freemem,&totmem));
  if (freemem>=m*n*sizeof(PetscScalar)) {
    PetscCallCUDA(cudaMalloc((void**)&d_work,m*n*sizeof(PetscScalar)));
    d_A = d_pv+(V->nc+V->l)*m;              /* first active column of V */
    d_B = d_q+V->l*ldq+V->l+(s-V->l)*ldq;   /* Q block starting at column s */
    PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,m,n,k,&sone,d_A,m,d_B,ldq_,&szero,d_work,m));
    /* copy the result back over columns s..e-1 of V */
    PetscCallCUDA(cudaMemcpy2D(d_A+(s-V->l)*m,m*sizeof(PetscScalar),d_work,m*sizeof(PetscScalar),m*sizeof(PetscScalar),n,cudaMemcpyDeviceToDevice));
  } else {
    /* blocked variant: choose a block size that fits in the free memory */
    PetscCall(PetscCuBLASIntCast(freemem/(m*sizeof(PetscScalar)),&bs));
    PetscCallCUDA(cudaMalloc((void**)&d_work,bs*n*sizeof(PetscScalar)));
    PetscCall(PetscCuBLASIntCast(m % bs,&l));
    if (l) {
      /* handle the remainder rows first so the loop below uses full blocks */
      d_A = d_pv+(V->nc+V->l)*m;
      d_B = d_q+V->l*ldq+V->l+(s-V->l)*ldq;
      PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,l,n,k,&sone,d_A,m,d_B,ldq_,&szero,d_work,l));
      PetscCallCUDA(cudaMemcpy2D(d_A+(s-V->l)*m,m*sizeof(PetscScalar),d_work,l*sizeof(PetscScalar),l*sizeof(PetscScalar),n,cudaMemcpyDeviceToDevice));
    }
    for (;l<m;l+=bs) {
      d_A = d_pv+(V->nc+V->l)*m+l;   /* row offset l within the active block */
      d_B = d_q+V->l*ldq+V->l+(s-V->l)*ldq;
      PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,bs,n,k,&sone,d_A,m,d_B,ldq_,&szero,d_work,bs));
      PetscCallCUDA(cudaMemcpy2D(d_A+(s-V->l)*m,m*sizeof(PetscScalar),d_work,bs*sizeof(PetscScalar),bs*sizeof(PetscScalar),n,cudaMemcpyDeviceToDevice));
    }
  }
  PetscCall(PetscLogGpuTimeEnd());
  if (matiscuda) PetscCall(MatDenseCUDARestoreArrayRead(Q,(const PetscScalar**)&d_q));
  else {
    PetscCall(MatDenseRestoreArrayRead(Q,&q));
    PetscCallCUDA(cudaFree(d_q));
  }
  PetscCallCUDA(cudaFree(d_work));
  PetscCall(VecCUDARestoreArray(ctx->v,&d_pv));
  PetscCall(PetscLogGpuFlops(2.0*m*n*k));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
    A(:,s:e-1) := A*B'(:,s:e-1)

    Same as BVMultInPlace_Svec_CUDA but multiplying by the conjugate
    transpose of B (CUBLAS_OP_C); no blocked fallback here, the whole
    m x n result buffer is allocated at once.
*/
PetscErrorCode BVMultInPlaceHermitianTranspose_Svec_CUDA(BV V,Mat Q,PetscInt s,PetscInt e)
{
  BV_SVEC           *ctx = (BV_SVEC*)V->data;
  PetscScalar       *d_pv,*d_q,*d_A,*d_B,*d_work,sone=1.0,szero=0.0;
  const PetscScalar *q;
  PetscInt          ldq,nq;
  PetscCuBLASInt    m=0,n=0,k=0,ldq_=0;
  cublasHandle_t    cublasv2handle;
  PetscBool         matiscuda;

  PetscFunctionBegin;
  if (!V->n) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCall(PetscCuBLASIntCast(V->n,&m));        /* local rows */
  PetscCall(PetscCuBLASIntCast(e-s,&n));         /* columns to be updated */
  PetscCall(PetscCuBLASIntCast(V->k-V->l,&k));   /* active columns of V */
  PetscCall(MatGetSize(Q,NULL,&nq));
  PetscCall(MatDenseGetLDA(Q,&ldq));
  PetscCall(PetscCuBLASIntCast(ldq,&ldq_));
  PetscCall(VecCUDAGetArray(ctx->v,&d_pv));
  PetscCall(PetscObjectTypeCompare((PetscObject)Q,MATSEQDENSECUDA,&matiscuda));
  if (matiscuda) PetscCall(MatDenseCUDAGetArrayRead(Q,(const PetscScalar**)&d_q));
  else {
    /* Q lives on the host: stage it in a temporary device buffer */
    PetscCall(MatDenseGetArrayRead(Q,&q));
    PetscCallCUDA(cudaMalloc((void**)&d_q,ldq*nq*sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy(d_q,q,ldq*nq*sizeof(PetscScalar),cudaMemcpyHostToDevice));
    PetscCall(PetscLogCpuToGpu(ldq*nq*sizeof(PetscScalar)));
  }
  PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
  PetscCall(PetscLogGpuTimeBegin());
  PetscCallCUDA(cudaMalloc((void**)&d_work,m*n*sizeof(PetscScalar)));
  d_A = d_pv+(V->nc+V->l)*m;   /* first active column of V */
  d_B = d_q+V->l*ldq+s;        /* offset for the transposed access of Q */
  PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_C,m,n,k,&sone,d_A,m,d_B,ldq_,&szero,d_work,m));
  /* copy the result back over columns s..e-1 of V */
  PetscCallCUDA(cudaMemcpy2D(d_A+(s-V->l)*m,m*sizeof(PetscScalar),d_work,m*sizeof(PetscScalar),m*sizeof(PetscScalar),n,cudaMemcpyDeviceToDevice));
  PetscCall(PetscLogGpuTimeEnd());
  if (matiscuda) PetscCall(MatDenseCUDARestoreArrayRead(Q,(const PetscScalar**)&d_q));
  else {
    PetscCall(MatDenseRestoreArrayRead(Q,&q));
    PetscCallCUDA(cudaFree(d_q));
  }
  PetscCallCUDA(cudaFree(d_work));
  PetscCall(VecCUDARestoreArray(ctx->v,&d_pv));
  PetscCall(PetscLogGpuFlops(2.0*m*n*k));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
    C := A'*B  (' denotes conjugate transpose, via CUBLAS_OP_C)

    A = active columns of Y, B = active columns of X, the m x n result is
    stored in the corresponding block of M. In the parallel case the local
    products are reduced with MPI_Allreduce on the host.
*/
PetscErrorCode BVDot_Svec_CUDA(BV X,BV Y,Mat M)
{
  BV_SVEC           *x = (BV_SVEC*)X->data,*y = (BV_SVEC*)Y->data;
  const PetscScalar *d_px,*d_py,*d_A,*d_B;
  PetscScalar       *pm,*d_work,sone=1.0,szero=0.0,*C,*CC;
  PetscInt          j,ldm;
  PetscCuBLASInt    m=0,n=0,k=0,ldm_=0;
  PetscMPIInt       len;
  cublasHandle_t    cublasv2handle;

  PetscFunctionBegin;
  PetscCall(PetscCuBLASIntCast(Y->k-Y->l,&m));   /* rows of the result (active cols of Y) */
  PetscCall(PetscCuBLASIntCast(X->k-X->l,&n));   /* cols of the result (active cols of X) */
  PetscCall(PetscCuBLASIntCast(X->n,&k));        /* local rows, contraction dimension */
  PetscCall(MatDenseGetLDA(M,&ldm));
  PetscCall(PetscCuBLASIntCast(ldm,&ldm_));
  PetscCall(VecCUDAGetArrayRead(x->v,&d_px));
  PetscCall(VecCUDAGetArrayRead(y->v,&d_py));
  PetscCall(MatDenseGetArrayWrite(M,&pm));
  PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
  PetscCallCUDA(cudaMalloc((void**)&d_work,m*n*sizeof(PetscScalar)));
  d_A = d_py+(Y->nc+Y->l)*Y->n;   /* first active column of Y */
  d_B = d_px+(X->nc+X->l)*X->n;   /* first active column of X */
  C = pm+X->l*ldm+Y->l;           /* destination block inside M */
  if (x->mpi) {
    if (ldm==m) {
      /* result block is contiguous: reduce directly into C */
      PetscCall(BVAllocateWork_Private(X,m*n));
      if (k) {
        PetscCall(PetscLogGpuTimeBegin());
        PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,m,n,k,&sone,d_A,k,d_B,k,&szero,d_work,ldm_));
        PetscCall(PetscLogGpuTimeEnd());
        PetscCallCUDA(cudaMemcpy(X->work,d_work,m*n*sizeof(PetscScalar),cudaMemcpyDeviceToHost));
        PetscCall(PetscLogGpuToCpu(m*n*sizeof(PetscScalar)));
      } else PetscCall(PetscArrayzero(X->work,m*n));   /* empty local part still participates in the reduction */
      PetscCall(PetscMPIIntCast(m*n,&len));
      PetscCall(MPIU_Allreduce(X->work,C,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)X)));
    } else {
      /* non-contiguous destination: reduce into a packed buffer CC, then copy column by column */
      PetscCall(BVAllocateWork_Private(X,2*m*n));
      CC = X->work+m*n;
      if (k) {
        PetscCall(PetscLogGpuTimeBegin());
        PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,m,n,k,&sone,d_A,k,d_B,k,&szero,d_work,m));
        PetscCall(PetscLogGpuTimeEnd());
        PetscCallCUDA(cudaMemcpy(X->work,d_work,m*n*sizeof(PetscScalar),cudaMemcpyDeviceToHost));
        PetscCall(PetscLogGpuToCpu(m*n*sizeof(PetscScalar)));
      } else PetscCall(PetscArrayzero(X->work,m*n));
      PetscCall(PetscMPIIntCast(m*n,&len));
      PetscCall(MPIU_Allreduce(X->work,CC,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)X)));
      for (j=0;j<n;j++) PetscCall(PetscArraycpy(C+j*ldm,CC+j*m,m));
    }
  } else {
    /* sequential case: no reduction needed, copy packed result into M */
    if (k) {
      PetscCall(BVAllocateWork_Private(X,m*n));
      PetscCall(PetscLogGpuTimeBegin());
      PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,m,n,k,&sone,d_A,k,d_B,k,&szero,d_work,m));
      PetscCall(PetscLogGpuTimeEnd());
      PetscCallCUDA(cudaMemcpy(X->work,d_work,m*n*sizeof(PetscScalar),cudaMemcpyDeviceToHost));
      PetscCall(PetscLogGpuToCpu(m*n*sizeof(PetscScalar)));
      for (j=0;j<n;j++) PetscCall(PetscArraycpy(C+j*ldm,X->work+j*m,m));
    }
  }
  PetscCallCUDA(cudaFree(d_work));
  PetscCall(MatDenseRestoreArrayWrite(M,&pm));
  PetscCall(VecCUDARestoreArrayRead(x->v,&d_px));
  PetscCall(VecCUDARestoreArrayRead(y->v,&d_py));
  PetscCall(PetscLogGpuFlops(2.0*m*n*k));
  PetscFunctionReturn(PETSC_SUCCESS);
}

331: #if defined(PETSC_USE_COMPLEX)
/* Unary functor returning the complex conjugate of its argument; used with
   thrust::transform in ConjugateCudaArray() below */
struct conjugate
{
  __host__ __device__
    PetscScalar operator()(PetscScalar x)
    {
      return PetscConj(x);
    }
};

/* Conjugate the n entries of the device array a, in place */
PetscErrorCode ConjugateCudaArray(PetscScalar *a, PetscInt n)
{
  PetscFunctionBegin;
  try {
    thrust::device_ptr<PetscScalar> first = thrust::device_pointer_cast(a);
    thrust::transform(first,first+n,first,conjugate());
  } catch (char *ex) {
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex);
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
354: #endif

/*
    y := A'*x computed as y' := x'*A

    A = active columns of X. The k results are written to the host array q,
    or to X->buffer when q is NULL. If X has an inner-product matrix, x is
    first replaced by B*x (X->Bx). In the parallel case the local partial
    results are reduced with MPI_Allreduce.
*/
PetscErrorCode BVDotVec_Svec_CUDA(BV X,Vec y,PetscScalar *q)
{
  BV_SVEC           *x = (BV_SVEC*)X->data;
  const PetscScalar *d_A,*d_x,*d_px,*d_py;
  PetscScalar       *d_work,szero=0.0,sone=1.0,*qq=q;
  PetscCuBLASInt    n=0,k=0,one=1;
  PetscMPIInt       len;
  Vec               z = y;
  cublasHandle_t    cublasv2handle;

  PetscFunctionBegin;
  PetscCall(PetscCuBLASIntCast(X->n,&n));        /* local rows */
  PetscCall(PetscCuBLASIntCast(X->k-X->l,&k));   /* number of active columns */
  PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
  if (X->matrix) {
    /* generalized inner product: use B*x instead of x */
    PetscCall(BV_IPMatMult(X,y));
    z = X->Bx;
  }
  PetscCall(VecCUDAGetArrayRead(x->v,&d_px));
  PetscCall(VecCUDAGetArrayRead(z,&d_py));
  if (!q) PetscCall(VecCUDAGetArrayWrite(X->buffer,&d_work));
  else PetscCallCUDA(cudaMalloc((void**)&d_work,k*sizeof(PetscScalar)));
  d_A = d_px+(X->nc+X->l)*X->n;   /* first active column of X */
  d_x = d_py;
  if (x->mpi) {
    PetscCall(BVAllocateWork_Private(X,k));
    if (n) {
      PetscCall(PetscLogGpuTimeBegin());
#if defined(PETSC_USE_COMPLEX)
      /* gemm computes conj(x')*A = conj(A'*x); conjugate afterwards to get A'*x */
      PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,one,k,n,&sone,d_x,n,d_A,n,&szero,d_work,one));
      PetscCall(ConjugateCudaArray(d_work,k));
#else
      PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,one,k,n,&sone,d_x,one,d_A,n,&szero,d_work,one));
#endif
      PetscCall(PetscLogGpuTimeEnd());
      PetscCallCUDA(cudaMemcpy(X->work,d_work,k*sizeof(PetscScalar),cudaMemcpyDeviceToHost));
      PetscCall(PetscLogGpuToCpu(k*sizeof(PetscScalar)));
    } else PetscCall(PetscArrayzero(X->work,k));   /* empty local part still participates in the reduction */
    if (!q) {
      /* reduce directly into the host side of X->buffer */
      PetscCall(VecCUDARestoreArrayWrite(X->buffer,&d_work));
      PetscCall(VecGetArray(X->buffer,&qq));
    } else PetscCallCUDA(cudaFree(d_work));
    PetscCall(PetscMPIIntCast(k,&len));
    PetscCall(MPIU_Allreduce(X->work,qq,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)X)));
    if (!q) PetscCall(VecRestoreArray(X->buffer,&qq));
  } else {
    if (n) {
      PetscCall(PetscLogGpuTimeBegin());
#if defined(PETSC_USE_COMPLEX)
      PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,one,k,n,&sone,d_x,n,d_A,n,&szero,d_work,one));
      PetscCall(ConjugateCudaArray(d_work,k));
#else
      PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,one,k,n,&sone,d_x,one,d_A,n,&szero,d_work,one));
#endif
      PetscCall(PetscLogGpuTimeEnd());
    }
    if (!q) PetscCall(VecCUDARestoreArrayWrite(X->buffer,&d_work));
    else {
      /* copy the result to the caller-provided host array */
      PetscCallCUDA(cudaMemcpy(q,d_work,k*sizeof(PetscScalar),cudaMemcpyDeviceToHost));
      PetscCall(PetscLogGpuToCpu(k*sizeof(PetscScalar)));
      PetscCallCUDA(cudaFree(d_work));
    }
  }
  PetscCall(VecCUDARestoreArrayRead(z,&d_py));
  PetscCall(VecCUDARestoreArrayRead(x->v,&d_px));
  PetscCall(PetscLogGpuFlops(2.0*n*k));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
    y := A'*x computed as y' := x'*A

    Local (non-reduced) variant of BVDotVec: the k partial results of this
    process are copied to the host array m without any MPI reduction.
*/
PetscErrorCode BVDotVec_Local_Svec_CUDA(BV X,Vec y,PetscScalar *m)
{
  BV_SVEC           *x = (BV_SVEC*)X->data;
  const PetscScalar *d_A,*d_x,*d_px,*d_py;
  PetscScalar       *d_y,szero=0.0,sone=1.0;
  PetscCuBLASInt    n=0,k=0,one=1;
  Vec               z = y;
  cublasHandle_t    cublasv2handle;

  PetscFunctionBegin;
  PetscCall(PetscCuBLASIntCast(X->n,&n));        /* local rows */
  PetscCall(PetscCuBLASIntCast(X->k-X->l,&k));   /* number of active columns */
  if (X->matrix) {
    /* generalized inner product: use B*x instead of x */
    PetscCall(BV_IPMatMult(X,y));
    z = X->Bx;
  }
  PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
  PetscCall(VecCUDAGetArrayRead(x->v,&d_px));
  PetscCall(VecCUDAGetArrayRead(z,&d_py));
  d_A = d_px+(X->nc+X->l)*X->n;   /* first active column of X */
  d_x = d_py;
  if (n) {
    PetscCallCUDA(cudaMalloc((void**)&d_y,k*sizeof(PetscScalar)));
    PetscCall(PetscLogGpuTimeBegin());
#if defined(PETSC_USE_COMPLEX)
    /* gemm computes conj(x')*A = conj(A'*x); conjugate afterwards to get A'*x */
    PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,one,k,n,&sone,d_x,n,d_A,n,&szero,d_y,one));
    PetscCall(ConjugateCudaArray(d_y,k));
#else
    PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,one,k,n,&sone,d_x,one,d_A,n,&szero,d_y,one));
#endif
    PetscCall(PetscLogGpuTimeEnd());
    PetscCallCUDA(cudaMemcpy(m,d_y,k*sizeof(PetscScalar),cudaMemcpyDeviceToHost));
    PetscCall(PetscLogGpuToCpu(k*sizeof(PetscScalar)));
    PetscCallCUDA(cudaFree(d_y));
  }
  PetscCall(VecCUDARestoreArrayRead(z,&d_py));
  PetscCall(VecCUDARestoreArrayRead(x->v,&d_px));
  PetscCall(PetscLogGpuFlops(2.0*n*k));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
    Scale by alpha: column j when j>=0, or all active columns when j<0.
    alpha==0 is done with cudaMemset, alpha==1 is a no-op.
*/
PetscErrorCode BVScale_Svec_CUDA(BV bv,PetscInt j,PetscScalar alpha)
{
  BV_SVEC        *ctx = (BV_SVEC*)bv->data;
  PetscScalar    *d_all,*d_target;
  PetscInt       firstcol,ncols;
  PetscCuBLASInt len=0,inc=1;
  cublasHandle_t handle;

  PetscFunctionBegin;
  PetscCall(VecCUDAGetArray(ctx->v,&d_all));
  /* j<0 selects the whole active block l..k-1; otherwise a single column */
  firstcol = (j<0)? bv->l: j;
  ncols    = (j<0)? bv->k-bv->l: 1;
  d_target = d_all+(bv->nc+firstcol)*bv->n;
  PetscCall(PetscCuBLASIntCast(ncols*bv->n,&len));
  if (alpha == (PetscScalar)0.0) PetscCallCUDA(cudaMemset(d_target,0,len*sizeof(PetscScalar)));
  else if (alpha != (PetscScalar)1.0) {
    PetscCall(PetscCUBLASGetHandle(&handle));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUBLAS(cublasXscal(handle,len,&alpha,d_target,inc));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(PetscLogGpuFlops(1.0*len));
  }
  PetscCall(VecCUDARestoreArray(ctx->v,&d_all));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
    W := A*V (active columns). Uses a single Mat-Mat product when V->vmm is
    set, otherwise applies A column by column via the BV work vectors.
*/
PetscErrorCode BVMatMult_Svec_CUDA(BV V,Mat A,BV W)
{
  BV_SVEC           *v = (BV_SVEC*)V->data,*w = (BV_SVEC*)W->data;
  Mat               Vmat,Wmat;
  const PetscScalar *d_pv;
  PetscScalar       *d_pw;
  PetscInt          j;

  PetscFunctionBegin;
  if (V->vmm) {
    /* single MatProduct on the dense matrices that wrap V and W */
    PetscCall(BVGetMat(V,&Vmat));
    PetscCall(BVGetMat(W,&Wmat));
    PetscCall(MatProductCreateWithMat(A,Vmat,NULL,Wmat));
    PetscCall(MatProductSetType(Wmat,MATPRODUCT_AB));
    PetscCall(MatProductSetFromOptions(Wmat));
    PetscCall(MatProductSymbolic(Wmat));
    PetscCall(MatProductNumeric(Wmat));
    PetscCall(MatProductClear(Wmat));
    PetscCall(BVRestoreMat(V,&Vmat));
    PetscCall(BVRestoreMat(W,&Wmat));
  } else {
    /* column by column: alias each column through the work Vecs cv[1] */
    PetscCall(VecCUDAGetArrayRead(v->v,&d_pv));
    PetscCall(VecCUDAGetArrayWrite(w->v,&d_pw));
    for (j=0;j<V->k-V->l;j++) {
      PetscCall(VecCUDAPlaceArray(V->cv[1],(PetscScalar *)d_pv+(V->nc+V->l+j)*V->n));
      PetscCall(VecCUDAPlaceArray(W->cv[1],d_pw+(W->nc+W->l+j)*W->n));
      PetscCall(MatMult(A,V->cv[1],W->cv[1]));
      PetscCall(VecCUDAResetArray(V->cv[1]));
      PetscCall(VecCUDAResetArray(W->cv[1]));
    }
    PetscCall(VecCUDARestoreArrayRead(v->v,&d_pv));
    PetscCall(VecCUDARestoreArrayWrite(w->v,&d_pw));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Copy the active columns of V onto the corresponding columns of W (device to device) */
PetscErrorCode BVCopy_Svec_CUDA(BV V,BV W)
{
  BV_SVEC           *v = (BV_SVEC*)V->data,*w = (BV_SVEC*)W->data;
  const PetscScalar *d_src;
  PetscScalar       *d_dst;

  PetscFunctionBegin;
  PetscCall(VecCUDAGetArrayRead(v->v,&d_src));
  PetscCall(VecCUDAGetArrayWrite(w->v,&d_dst));
  /* one contiguous copy of the (k-l) active columns, offset past the leading nc+l columns */
  PetscCallCUDA(cudaMemcpy(d_dst+(W->nc+W->l)*W->n,d_src+(V->nc+V->l)*V->n,(V->k-V->l)*V->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice));
  PetscCall(VecCUDARestoreArrayRead(v->v,&d_src));
  PetscCall(VecCUDARestoreArrayWrite(w->v,&d_dst));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Copy column j of V onto column i (device to device) */
PetscErrorCode BVCopyColumn_Svec_CUDA(BV V,PetscInt j,PetscInt i)
{
  BV_SVEC     *v = (BV_SVEC*)V->data;
  PetscScalar *base;

  PetscFunctionBegin;
  PetscCall(VecCUDAGetArray(v->v,&base));
  {
    PetscScalar       *dst = base+(V->nc+i)*V->n;
    const PetscScalar *src = base+(V->nc+j)*V->n;
    PetscCallCUDA(cudaMemcpy(dst,src,V->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice));
  }
  PetscCall(VecCUDARestoreArray(v->v,&base));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
    Resize the BV to hold m columns: create a new backing Vec of size m*n,
    optionally copy over min(m,old m) columns on the device, then replace
    the old Vec.
*/
PetscErrorCode BVResize_Svec_CUDA(BV bv,PetscInt m,PetscBool copy)
{
  BV_SVEC           *ctx = (BV_SVEC*)bv->data;
  const PetscScalar *d_pv;
  PetscScalar       *d_pnew;
  PetscInt          bs;
  Vec               vnew;
  char              str[50];

  PetscFunctionBegin;
  PetscCall(VecGetBlockSize(bv->t,&bs));
  /* new Vec of the same type/comm/block size as the template vector bv->t */
  PetscCall(VecCreate(PetscObjectComm((PetscObject)bv->t),&vnew));
  PetscCall(VecSetType(vnew,((PetscObject)bv->t)->type_name));
  PetscCall(VecSetSizes(vnew,m*bv->n,PETSC_DECIDE));
  PetscCall(VecSetBlockSize(vnew,bs));
  if (((PetscObject)bv)->name) {
    PetscCall(PetscSNPrintf(str,sizeof(str),"%s_0",((PetscObject)bv)->name));
    PetscCall(PetscObjectSetName((PetscObject)vnew,str));
  }
  if (copy) {
    /* preserve the existing columns (truncated if shrinking) */
    PetscCall(VecCUDAGetArrayRead(ctx->v,&d_pv));
    PetscCall(VecCUDAGetArrayWrite(vnew,&d_pnew));
    PetscCallCUDA(cudaMemcpy(d_pnew,d_pv,PetscMin(m,bv->m)*bv->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice));
    PetscCall(VecCUDARestoreArrayRead(ctx->v,&d_pv));
    PetscCall(VecCUDARestoreArrayWrite(vnew,&d_pnew));
  }
  PetscCall(VecDestroy(&ctx->v));
  ctx->v = vnew;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Expose column j of the BV as one of the work Vecs (bv->cv) by aliasing its device storage */
PetscErrorCode BVGetColumn_Svec_CUDA(BV bv,PetscInt j,Vec *v)
{
  BV_SVEC     *ctx = (BV_SVEC*)bv->data;
  PetscScalar *d_pv;

  PetscFunctionBegin;
  (void)v; /* unused here; the caller-visible Vec is handled by the generic BVGetColumn */
  PetscCall(VecCUDAGetArray(ctx->v,&d_pv));
  /* attach column j (past the nc leading columns) to the currently available work Vec */
  PetscCall(VecCUDAPlaceArray(bv->cv[BVAvailableVec],d_pv+(bv->nc+j)*bv->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/* Undo BVGetColumn: detach the aliased device pointer from the matching work Vec */
PetscErrorCode BVRestoreColumn_Svec_CUDA(BV bv,PetscInt j,Vec *v)
{
  BV_SVEC  *ctx = (BV_SVEC*)bv->data;
  PetscInt idx;

  PetscFunctionBegin;
  (void)v; /* unused here; the caller-visible Vec is handled by the generic BVRestoreColumn */
  /* cv[0] holds column ci[0]; anything else must be cv[1] */
  idx = (j==bv->ci[0])? 0: 1;
  PetscCall(VecCUDAResetArray(bv->cv[idx]));
  PetscCall(VecCUDARestoreArray(ctx->v,NULL));
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
    Called when the L/R halves of a split BV are returned. If either half
    was modified (object state changed), its device data is touched with a
    get/restore pair, and finally the parent's data as well, so that the
    sync flag ends up consistent (forced to PETSC_CUDA_BOTH).
*/
PetscErrorCode BVRestoreSplit_Svec_CUDA(BV bv,BV *L,BV *R)
{
  Vec               v;
  const PetscScalar *d_pv;
  PetscObjectState  lstate,rstate;
  PetscBool         change=PETSC_FALSE;

  PetscFunctionBegin;
  /* force sync flag to PETSC_CUDA_BOTH */
  if (L) {
    PetscCall(PetscObjectStateGet((PetscObject)*L,&lstate));
    if (lstate != bv->lstate) {
      /* left half was modified since the split */
      v = ((BV_SVEC*)bv->L->data)->v;
      PetscCall(VecCUDAGetArrayRead(v,&d_pv));
      PetscCall(VecCUDARestoreArrayRead(v,&d_pv));
      change = PETSC_TRUE;
    }
  }
  if (R) {
    PetscCall(PetscObjectStateGet((PetscObject)*R,&rstate));
    if (rstate != bv->rstate) {
      /* right half was modified since the split */
      v = ((BV_SVEC*)bv->R->data)->v;
      PetscCall(VecCUDAGetArrayRead(v,&d_pv));
      PetscCall(VecCUDARestoreArrayRead(v,&d_pv));
      change = PETSC_TRUE;
    }
  }
  if (change) {
    /* touch the parent's storage read-write so its state is updated too */
    v = ((BV_SVEC*)bv->data)->v;
    PetscCall(VecCUDAGetArray(v,(PetscScalar **)&d_pv));
    PetscCall(VecCUDARestoreArray(v,(PetscScalar **)&d_pv));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
    Wrap the active columns of the BV as a dense CUDA matrix (bv->Aget),
    without copying: the Mat is made to point directly at the BV's device
    storage via MatDenseCUDAPlaceArray.
*/
PetscErrorCode BVGetMat_Svec_CUDA(BV bv,Mat *A)
{
  BV_SVEC        *ctx = (BV_SVEC*)bv->data;
  PetscScalar    *vv,*aa;
  PetscBool      create=PETSC_FALSE;
  PetscInt       m,cols;

  PetscFunctionBegin;
  m = bv->k-bv->l;   /* number of active columns to expose */
  if (!bv->Aget) create=PETSC_TRUE;
  else {
    /* a non-NULL array means the previous BVGetMat was not matched by BVRestoreMat */
    PetscCall(MatDenseCUDAGetArray(bv->Aget,&aa));
    PetscCheck(!aa,PetscObjectComm((PetscObject)bv),PETSC_ERR_ARG_WRONGSTATE,"BVGetMat already called on this BV");
    PetscCall(MatGetSize(bv->Aget,NULL,&cols));
    if (cols!=m) {
      /* column count changed since last time: recreate the wrapper Mat */
      PetscCall(MatDestroy(&bv->Aget));
      create=PETSC_TRUE;
    }
  }
  PetscCall(VecCUDAGetArray(ctx->v,&vv));
  if (create) {
    PetscCall(MatCreateDenseCUDA(PetscObjectComm((PetscObject)bv),bv->n,PETSC_DECIDE,bv->N,m,vv,&bv->Aget)); /* pass a pointer to avoid allocation of storage */
    PetscCall(MatDenseCUDAReplaceArray(bv->Aget,NULL));  /* replace with a null pointer, the value after BVRestoreMat */
  }
  PetscCall(MatDenseCUDAPlaceArray(bv->Aget,vv+(bv->nc+bv->l)*bv->n));  /* set the actual pointer */
  *A = bv->Aget;
  PetscFunctionReturn(PETSC_SUCCESS);
}

/*
    Undo BVGetMat: detach the placed device pointer from bv->Aget and
    restore the BV's backing Vec array that was taken in BVGetMat.
*/
PetscErrorCode BVRestoreMat_Svec_CUDA(BV bv,Mat *A)
{
  BV_SVEC        *ctx = (BV_SVEC*)bv->data;
  PetscScalar    *vv,*aa;

  PetscFunctionBegin;
  PetscCall(MatDenseCUDAGetArray(bv->Aget,&aa));
  /* recover the base pointer of the Vec array by undoing the (nc+l)-column offset applied in BVGetMat */
  vv = aa-(bv->nc+bv->l)*bv->n;
  PetscCall(MatDenseCUDAResetArray(bv->Aget));
  PetscCall(VecCUDARestoreArray(ctx->v,&vv));
  *A = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}