Actual source code: sveccuda.cu
slepc-3.19.0 2023-03-31
1: /*
2: - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
3: SLEPc - Scalable Library for Eigenvalue Problem Computations
4: Copyright (c) 2002-, Universitat Politecnica de Valencia, Spain
6: This file is part of SLEPc.
7: SLEPc is distributed under a 2-clause BSD license (see LICENSE).
8: - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
9: */
10: /*
11: BV implemented as a single Vec (CUDA version)
12: */
#include <slepc/private/bvimpl.h>
#include "../src/sys/classes/bv/impls/svec/svec.h"
#include <slepccublas.h>

#if defined(PETSC_USE_COMPLEX)
#include <exception>
#include <thrust/device_ptr.h>
#include <thrust/transform.h>
#endif
22: #define BLOCKSIZE 64
24: /*
25: B := alpha*A + beta*B
27: A,B are nxk (ld=n)
28: */
static PetscErrorCode BVAXPY_BLAS_CUDA(BV bv,PetscInt n_,PetscInt k_,PetscScalar alpha,const PetscScalar *d_A,PetscScalar beta,PetscScalar *d_B)
{
  cublasHandle_t handle;
  PetscCuBLASInt len=0,inc=1;

  PetscFunctionBegin;
  (void)bv; /* unused parameter */
  PetscCall(PetscCUBLASGetHandle(&handle));
  /* both matrices are stored contiguously (ld=n), so treat them as vectors of n*k entries */
  PetscCall(PetscCuBLASIntCast(n_*k_,&len));
  PetscCall(PetscLogGpuTimeBegin());
  /* B := beta*B (skipped when beta is one) */
  if (beta!=(PetscScalar)1.0) {
    PetscCallCUBLAS(cublasXscal(handle,len,&beta,d_B,inc));
    PetscCall(PetscLogGpuFlops(1.0*len));
  }
  /* B := alpha*A + B */
  PetscCallCUBLAS(cublasXaxpy(handle,len,&alpha,d_A,inc,d_B,inc));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(PetscLogGpuFlops(2.0*len));
  PetscFunctionReturn(PETSC_SUCCESS);
}
49: /*
50: C := alpha*A*Q + beta*C
51: */
PetscErrorCode BVMult_Svec_CUDA(BV Y,PetscScalar alpha,PetscScalar beta,BV X,Mat Q)
{
  BV_SVEC *y = (BV_SVEC*)Y->data,*x = (BV_SVEC*)X->data;
  const PetscScalar *d_px,*d_A,*q;
  PetscScalar *d_py,*d_q,*d_B,*d_C;
  PetscInt ldq,mq;
  PetscCuBLASInt m=0,n=0,k=0,ldq_=0;
  cublasHandle_t cublasv2handle;
  PetscBool matiscuda;

  PetscFunctionBegin;
  if (!Y->n) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCall(VecCUDAGetArrayRead(x->v,&d_px));
  /* beta=0 overwrites C completely, so the current contents of Y need not be valid on the GPU */
  if (beta==(PetscScalar)0.0) PetscCall(VecCUDAGetArrayWrite(y->v,&d_py));
  else PetscCall(VecCUDAGetArray(y->v,&d_py));
  d_A = d_px+(X->nc+X->l)*X->n;  /* first active column of X */
  d_C = d_py+(Y->nc+Y->l)*Y->n;  /* first active column of Y */
  if (Q) {
    PetscCall(PetscCuBLASIntCast(Y->n,&m));      /* local rows */
    PetscCall(PetscCuBLASIntCast(Y->k-Y->l,&n)); /* active columns of Y */
    PetscCall(PetscCuBLASIntCast(X->k-X->l,&k)); /* active columns of X */
    PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
    PetscCall(MatGetSize(Q,NULL,&mq));
    PetscCall(MatDenseGetLDA(Q,&ldq));
    PetscCall(PetscCuBLASIntCast(ldq,&ldq_));
    /* if Q does not already reside on the GPU, copy it to a temporary device buffer */
    PetscCall(PetscObjectTypeCompare((PetscObject)Q,MATSEQDENSECUDA,&matiscuda));
    if (matiscuda) PetscCall(MatDenseCUDAGetArrayRead(Q,(const PetscScalar**)&d_q));
    else {
      PetscCall(MatDenseGetArrayRead(Q,&q));
      PetscCallCUDA(cudaMalloc((void**)&d_q,ldq*mq*sizeof(PetscScalar)));
      PetscCallCUDA(cudaMemcpy(d_q,q,ldq*mq*sizeof(PetscScalar),cudaMemcpyHostToDevice));
      PetscCall(PetscLogCpuToGpu(ldq*mq*sizeof(PetscScalar)));
    }
    d_B = d_q+Y->l*ldq+X->l;  /* active block of Q, element (X->l,Y->l) */
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,m,n,k,&alpha,d_A,m,d_B,ldq_,&beta,d_C,m));
    PetscCall(PetscLogGpuTimeEnd());
    if (matiscuda) PetscCall(MatDenseCUDARestoreArrayRead(Q,(const PetscScalar**)&d_q));
    else {
      PetscCall(MatDenseRestoreArrayRead(Q,&q));
      PetscCallCUDA(cudaFree(d_q));
    }
    PetscCall(PetscLogGpuFlops(2.0*m*n*k));
  } else PetscCall(BVAXPY_BLAS_CUDA(Y,Y->n,Y->k-Y->l,alpha,d_A,beta,d_C));
  PetscCall(VecCUDARestoreArrayRead(x->v,&d_px));
  /* the restore must match the get above: with beta!=0 the array was obtained with
     VecCUDAGetArray(), so pairing it with VecCUDARestoreArrayWrite() would be an
     inconsistent use of the Vec accessor API (cf. BVMultVec_Svec_CUDA) */
  if (beta==(PetscScalar)0.0) PetscCall(VecCUDARestoreArrayWrite(y->v,&d_py));
  else PetscCall(VecCUDARestoreArray(y->v,&d_py));
  PetscFunctionReturn(PETSC_SUCCESS);
}
101: /*
102: y := alpha*A*x + beta*y
103: */
PetscErrorCode BVMultVec_Svec_CUDA(BV X,PetscScalar alpha,PetscScalar beta,Vec y,PetscScalar *q)
{
  BV_SVEC *x = (BV_SVEC*)X->data;
  const PetscScalar *d_px,*d_A;
  PetscScalar *d_py,*d_q,*d_x,*d_y;
  PetscCuBLASInt n=0,k=0,one=1;
  cublasHandle_t cublasv2handle;

  PetscFunctionBegin;
  PetscCall(PetscCuBLASIntCast(X->n,&n));      /* local number of rows */
  PetscCall(PetscCuBLASIntCast(X->k-X->l,&k)); /* number of active columns */
  PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
  PetscCall(VecCUDAGetArrayRead(x->v,&d_px));
  /* beta=0 overwrites y completely, so its contents need not be transferred to the GPU */
  if (beta==(PetscScalar)0.0) PetscCall(VecCUDAGetArrayWrite(y,&d_py));
  else PetscCall(VecCUDAGetArray(y,&d_py));
  /* coefficients: the internal buffer when q=NULL, otherwise copied host-to-device */
  if (!q) PetscCall(VecCUDAGetArray(X->buffer,&d_q));
  else {
    PetscCallCUDA(cudaMalloc((void**)&d_q,k*sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy(d_q,q,k*sizeof(PetscScalar),cudaMemcpyHostToDevice));
    PetscCall(PetscLogCpuToGpu(k*sizeof(PetscScalar)));
  }
  d_A = d_px+(X->nc+X->l)*X->n; /* first active column of X */
  d_x = d_q;
  d_y = d_py;
  PetscCall(PetscLogGpuTimeBegin());
  PetscCallCUBLAS(cublasXgemv(cublasv2handle,CUBLAS_OP_N,n,k,&alpha,d_A,n,d_x,one,&beta,d_y,one));
  PetscCall(PetscLogGpuTimeEnd());
  PetscCall(VecCUDARestoreArrayRead(x->v,&d_px));
  /* restore calls mirror the conditional get calls above */
  if (beta==(PetscScalar)0.0) PetscCall(VecCUDARestoreArrayWrite(y,&d_py));
  else PetscCall(VecCUDARestoreArray(y,&d_py));
  if (!q) PetscCall(VecCUDARestoreArray(X->buffer,&d_q));
  else PetscCallCUDA(cudaFree(d_q));
  PetscCall(PetscLogGpuFlops(2.0*n*k));
  PetscFunctionReturn(PETSC_SUCCESS);
}
140: /*
141: A(:,s:e-1) := A*B(:,s:e-1)
142: */
PetscErrorCode BVMultInPlace_Svec_CUDA(BV V,Mat Q,PetscInt s,PetscInt e)
{
  BV_SVEC *ctx = (BV_SVEC*)V->data;
  PetscScalar *d_pv,*d_q,*d_A,*d_B,*d_work,sone=1.0,szero=0.0;
  const PetscScalar *q;
  PetscInt ldq,nq;
  PetscCuBLASInt m=0,n=0,k=0,l=0,ldq_=0,bs=BLOCKSIZE;
  size_t freemem,totmem;
  cublasHandle_t cublasv2handle;
  PetscBool matiscuda;

  PetscFunctionBegin;
  if (!V->n) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCall(PetscCuBLASIntCast(V->n,&m));      /* local rows */
  PetscCall(PetscCuBLASIntCast(e-s,&n));       /* number of columns being updated */
  PetscCall(PetscCuBLASIntCast(V->k-V->l,&k)); /* active columns of V */
  PetscCall(MatGetSize(Q,NULL,&nq));
  PetscCall(MatDenseGetLDA(Q,&ldq));
  PetscCall(PetscCuBLASIntCast(ldq,&ldq_));
  PetscCall(VecCUDAGetArray(ctx->v,&d_pv));
  /* if Q is a host matrix, copy it to a temporary device buffer */
  PetscCall(PetscObjectTypeCompare((PetscObject)Q,MATSEQDENSECUDA,&matiscuda));
  if (matiscuda) PetscCall(MatDenseCUDAGetArrayRead(Q,(const PetscScalar**)&d_q));
  else {
    PetscCall(MatDenseGetArrayRead(Q,&q));
    PetscCallCUDA(cudaMalloc((void**)&d_q,ldq*nq*sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy(d_q,q,ldq*nq*sizeof(PetscScalar),cudaMemcpyHostToDevice));
    PetscCall(PetscLogCpuToGpu(ldq*nq*sizeof(PetscScalar)));
  }
  PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
  PetscCall(PetscLogGpuTimeBegin());
  /* try to allocate the whole matrix */
  PetscCallCUDA(cudaMemGetInfo(&freemem,&totmem));
  if (freemem>=m*n*sizeof(PetscScalar)) {
    /* enough free device memory: a single gemm into a full-size work buffer */
    PetscCallCUDA(cudaMalloc((void**)&d_work,m*n*sizeof(PetscScalar)));
    d_A = d_pv+(V->nc+V->l)*m;            /* first active column of V */
    d_B = d_q+V->l*ldq+V->l+(s-V->l)*ldq; /* block of Q at row V->l, column s */
    PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,m,n,k,&sone,d_A,m,d_B,ldq_,&szero,d_work,m));
    /* overwrite columns s..e-1 of V with the result */
    PetscCallCUDA(cudaMemcpy2D(d_A+(s-V->l)*m,m*sizeof(PetscScalar),d_work,m*sizeof(PetscScalar),m*sizeof(PetscScalar),n,cudaMemcpyDeviceToDevice));
  } else {
    /* low-memory fallback: process the rows in chunks of bs rows at a time */
    /* NOTE(review): bs is derived from freemem/(m*sizeof) while the work buffer holds
       bs*n scalars -- confirm this cannot exceed free memory when n>m */
    PetscCall(PetscCuBLASIntCast(freemem/(m*sizeof(PetscScalar)),&bs));
    PetscCallCUDA(cudaMalloc((void**)&d_work,bs*n*sizeof(PetscScalar)));
    PetscCall(PetscCuBLASIntCast(m % bs,&l)); /* remainder chunk of l=m mod bs rows, done first */
    if (l) {
      d_A = d_pv+(V->nc+V->l)*m;
      d_B = d_q+V->l*ldq+V->l+(s-V->l)*ldq;
      PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,l,n,k,&sone,d_A,m,d_B,ldq_,&szero,d_work,l));
      PetscCallCUDA(cudaMemcpy2D(d_A+(s-V->l)*m,m*sizeof(PetscScalar),d_work,l*sizeof(PetscScalar),l*sizeof(PetscScalar),n,cudaMemcpyDeviceToDevice));
    }
    /* full chunks of bs rows, at row offset l */
    for (;l<m;l+=bs) {
      d_A = d_pv+(V->nc+V->l)*m+l;
      d_B = d_q+V->l*ldq+V->l+(s-V->l)*ldq;
      PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,bs,n,k,&sone,d_A,m,d_B,ldq_,&szero,d_work,bs));
      PetscCallCUDA(cudaMemcpy2D(d_A+(s-V->l)*m,m*sizeof(PetscScalar),d_work,bs*sizeof(PetscScalar),bs*sizeof(PetscScalar),n,cudaMemcpyDeviceToDevice));
    }
  }
  PetscCall(PetscLogGpuTimeEnd());
  if (matiscuda) PetscCall(MatDenseCUDARestoreArrayRead(Q,(const PetscScalar**)&d_q));
  else {
    PetscCall(MatDenseRestoreArrayRead(Q,&q));
    PetscCallCUDA(cudaFree(d_q));
  }
  PetscCallCUDA(cudaFree(d_work));
  PetscCall(VecCUDARestoreArray(ctx->v,&d_pv));
  PetscCall(PetscLogGpuFlops(2.0*m*n*k));
  PetscFunctionReturn(PETSC_SUCCESS);
}
/*
   A(:,s:e-1) := A*B(s:e-1,:)'
*/
PetscErrorCode BVMultInPlaceHermitianTranspose_Svec_CUDA(BV V,Mat Q,PetscInt s,PetscInt e)
{
  BV_SVEC *ctx = (BV_SVEC*)V->data;
  PetscScalar *d_pv,*d_q,*d_A,*d_B,*d_work,sone=1.0,szero=0.0;
  const PetscScalar *q;
  PetscInt ldq,nq;
  PetscCuBLASInt m=0,n=0,k=0,ldq_=0;
  cublasHandle_t cublasv2handle;
  PetscBool matiscuda;

  PetscFunctionBegin;
  if (!V->n) PetscFunctionReturn(PETSC_SUCCESS);
  PetscCall(PetscCuBLASIntCast(V->n,&m));      /* local rows */
  PetscCall(PetscCuBLASIntCast(e-s,&n));       /* number of columns being updated */
  PetscCall(PetscCuBLASIntCast(V->k-V->l,&k)); /* active columns of V */
  PetscCall(MatGetSize(Q,NULL,&nq));
  PetscCall(MatDenseGetLDA(Q,&ldq));
  PetscCall(PetscCuBLASIntCast(ldq,&ldq_));
  PetscCall(VecCUDAGetArray(ctx->v,&d_pv));
  /* if Q is a host matrix, copy it to a temporary device buffer */
  PetscCall(PetscObjectTypeCompare((PetscObject)Q,MATSEQDENSECUDA,&matiscuda));
  if (matiscuda) PetscCall(MatDenseCUDAGetArrayRead(Q,(const PetscScalar**)&d_q));
  else {
    PetscCall(MatDenseGetArrayRead(Q,&q));
    PetscCallCUDA(cudaMalloc((void**)&d_q,ldq*nq*sizeof(PetscScalar)));
    PetscCallCUDA(cudaMemcpy(d_q,q,ldq*nq*sizeof(PetscScalar),cudaMemcpyHostToDevice));
    PetscCall(PetscLogCpuToGpu(ldq*nq*sizeof(PetscScalar)));
  }
  PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
  PetscCall(PetscLogGpuTimeBegin());
  PetscCallCUDA(cudaMalloc((void**)&d_work,m*n*sizeof(PetscScalar)));
  d_A = d_pv+(V->nc+V->l)*m; /* first active column of V */
  d_B = d_q+V->l*ldq+s;      /* element (s,V->l) of Q; rows s..e-1 used via CUBLAS_OP_C below */
  PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_C,m,n,k,&sone,d_A,m,d_B,ldq_,&szero,d_work,m));
  /* write the product back into columns s..e-1 of V */
  PetscCallCUDA(cudaMemcpy2D(d_A+(s-V->l)*m,m*sizeof(PetscScalar),d_work,m*sizeof(PetscScalar),m*sizeof(PetscScalar),n,cudaMemcpyDeviceToDevice));
  PetscCall(PetscLogGpuTimeEnd());
  if (matiscuda) PetscCall(MatDenseCUDARestoreArrayRead(Q,(const PetscScalar**)&d_q));
  else {
    PetscCall(MatDenseRestoreArrayRead(Q,&q));
    PetscCallCUDA(cudaFree(d_q));
  }
  PetscCallCUDA(cudaFree(d_work));
  PetscCall(VecCUDARestoreArray(ctx->v,&d_pv));
  PetscCall(PetscLogGpuFlops(2.0*m*n*k));
  PetscFunctionReturn(PETSC_SUCCESS);
}
259: /*
260: C := A'*B
261: */
PetscErrorCode BVDot_Svec_CUDA(BV X,BV Y,Mat M)
{
  BV_SVEC *x = (BV_SVEC*)X->data,*y = (BV_SVEC*)Y->data;
  const PetscScalar *d_px,*d_py,*d_A,*d_B;
  PetscScalar *pm,*d_work,sone=1.0,szero=0.0,*C,*CC;
  PetscInt j,ldm;
  PetscCuBLASInt m=0,n=0,k=0,ldm_=0;
  PetscMPIInt len;
  cublasHandle_t cublasv2handle;

  PetscFunctionBegin;
  PetscCall(PetscCuBLASIntCast(Y->k-Y->l,&m)); /* rows of the result (active columns of Y) */
  PetscCall(PetscCuBLASIntCast(X->k-X->l,&n)); /* columns of the result (active columns of X) */
  PetscCall(PetscCuBLASIntCast(X->n,&k));      /* contracted dimension (local rows) */
  PetscCall(MatDenseGetLDA(M,&ldm));
  PetscCall(PetscCuBLASIntCast(ldm,&ldm_));
  PetscCall(VecCUDAGetArrayRead(x->v,&d_px));
  PetscCall(VecCUDAGetArrayRead(y->v,&d_py));
  PetscCall(MatDenseGetArrayWrite(M,&pm));
  PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
  PetscCallCUDA(cudaMalloc((void**)&d_work,m*n*sizeof(PetscScalar)));
  d_A = d_py+(Y->nc+Y->l)*Y->n; /* first active column of Y */
  d_B = d_px+(X->nc+X->l)*X->n; /* first active column of X */
  C = pm+X->l*ldm+Y->l;         /* active block of M (host memory) */
  if (x->mpi) {
    if (ldm==m) {
      /* result block is contiguous in M: allreduce directly into C */
      PetscCall(BVAllocateWork_Private(X,m*n));
      if (k) {
        PetscCall(PetscLogGpuTimeBegin());
        PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,m,n,k,&sone,d_A,k,d_B,k,&szero,d_work,ldm_));
        PetscCall(PetscLogGpuTimeEnd());
        PetscCallCUDA(cudaMemcpy(X->work,d_work,m*n*sizeof(PetscScalar),cudaMemcpyDeviceToHost));
        PetscCall(PetscLogGpuToCpu(m*n*sizeof(PetscScalar)));
      } else PetscCall(PetscArrayzero(X->work,m*n)); /* no local rows: contribute zeros */
      PetscCall(PetscMPIIntCast(m*n,&len));
      PetscCall(MPIU_Allreduce(X->work,C,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)X)));
    } else {
      /* non-contiguous block: allreduce into workspace CC, then copy column by column */
      PetscCall(BVAllocateWork_Private(X,2*m*n));
      CC = X->work+m*n;
      if (k) {
        PetscCall(PetscLogGpuTimeBegin());
        PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,m,n,k,&sone,d_A,k,d_B,k,&szero,d_work,m));
        PetscCall(PetscLogGpuTimeEnd());
        PetscCallCUDA(cudaMemcpy(X->work,d_work,m*n*sizeof(PetscScalar),cudaMemcpyDeviceToHost));
        PetscCall(PetscLogGpuToCpu(m*n*sizeof(PetscScalar)));
      } else PetscCall(PetscArrayzero(X->work,m*n));
      PetscCall(PetscMPIIntCast(m*n,&len));
      PetscCall(MPIU_Allreduce(X->work,CC,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)X)));
      for (j=0;j<n;j++) PetscCall(PetscArraycpy(C+j*ldm,CC+j*m,m));
    }
  } else {
    /* sequential: compute on the GPU, copy to host, place columns into M */
    if (k) {
      PetscCall(BVAllocateWork_Private(X,m*n));
      PetscCall(PetscLogGpuTimeBegin());
      PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,m,n,k,&sone,d_A,k,d_B,k,&szero,d_work,m));
      PetscCall(PetscLogGpuTimeEnd());
      PetscCallCUDA(cudaMemcpy(X->work,d_work,m*n*sizeof(PetscScalar),cudaMemcpyDeviceToHost));
      PetscCall(PetscLogGpuToCpu(m*n*sizeof(PetscScalar)));
      for (j=0;j<n;j++) PetscCall(PetscArraycpy(C+j*ldm,X->work+j*m,m));
    }
  }
  PetscCallCUDA(cudaFree(d_work));
  PetscCall(MatDenseRestoreArrayWrite(M,&pm));
  PetscCall(VecCUDARestoreArrayRead(x->v,&d_px));
  PetscCall(VecCUDARestoreArrayRead(y->v,&d_py));
  PetscCall(PetscLogGpuFlops(2.0*m*n*k));
  PetscFunctionReturn(PETSC_SUCCESS);
}
331: #if defined(PETSC_USE_COMPLEX)
/* host/device unary functor returning the complex conjugate of its argument;
   used with thrust::transform in ConjugateCudaArray */
struct conjugate
{
  __host__ __device__
  PetscScalar operator()(PetscScalar x)
  {
    return PetscConj(x);
  }
};
/*
   Conjugate the first n entries of the device array a, in place.
*/
PetscErrorCode ConjugateCudaArray(PetscScalar *a, PetscInt n)
{
  thrust::device_ptr<PetscScalar> ptr;

  PetscFunctionBegin;
  try {
    ptr = thrust::device_pointer_cast(a);
    thrust::transform(ptr,ptr+n,ptr,conjugate());
  } catch (const std::exception& ex) {
    /* Thrust reports failures with exceptions derived from std::exception
       (e.g. thrust::system_error); the previous "catch (char *ex)" clause
       could never match them, so errors escaped through PETSc's C call stack */
    SETERRQ(PETSC_COMM_SELF,PETSC_ERR_LIB,"Thrust error: %s", ex.what());
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
354: #endif
356: /*
357: y := A'*x computed as y' := x'*A
358: */
PetscErrorCode BVDotVec_Svec_CUDA(BV X,Vec y,PetscScalar *q)
{
  BV_SVEC *x = (BV_SVEC*)X->data;
  const PetscScalar *d_A,*d_x,*d_px,*d_py;
  PetscScalar *d_work,szero=0.0,sone=1.0,*qq=q;
  PetscCuBLASInt n=0,k=0,one=1;
  PetscMPIInt len;
  Vec z = y;
  cublasHandle_t cublasv2handle;

  PetscFunctionBegin;
  PetscCall(PetscCuBLASIntCast(X->n,&n));      /* local rows */
  PetscCall(PetscCuBLASIntCast(X->k-X->l,&k)); /* number of active columns */
  PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
  if (X->matrix) {
    /* non-standard inner product: work with z = B*y instead of y */
    PetscCall(BV_IPMatMult(X,y));
    z = X->Bx;
  }
  PetscCall(VecCUDAGetArrayRead(x->v,&d_px));
  PetscCall(VecCUDAGetArrayRead(z,&d_py));
  /* result goes to the internal buffer when q=NULL, otherwise to a temporary device array */
  if (!q) PetscCall(VecCUDAGetArrayWrite(X->buffer,&d_work));
  else PetscCallCUDA(cudaMalloc((void**)&d_work,k*sizeof(PetscScalar)));
  d_A = d_px+(X->nc+X->l)*X->n; /* first active column of X */
  d_x = d_py;
  if (x->mpi) {
    PetscCall(BVAllocateWork_Private(X,k));
    if (n) {
      PetscCall(PetscLogGpuTimeBegin());
#if defined(PETSC_USE_COMPLEX)
      /* compute conj(x')*A as a 1 x k gemm, then conjugate to obtain A'*x */
      PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,one,k,n,&sone,d_x,n,d_A,n,&szero,d_work,one));
      PetscCall(ConjugateCudaArray(d_work,k));
#else
      PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,one,k,n,&sone,d_x,one,d_A,n,&szero,d_work,one));
#endif
      PetscCall(PetscLogGpuTimeEnd());
      PetscCallCUDA(cudaMemcpy(X->work,d_work,k*sizeof(PetscScalar),cudaMemcpyDeviceToHost));
      PetscCall(PetscLogGpuToCpu(k*sizeof(PetscScalar)));
    } else PetscCall(PetscArrayzero(X->work,k)); /* empty local part contributes zeros */
    if (!q) {
      PetscCall(VecCUDARestoreArrayWrite(X->buffer,&d_work));
      PetscCall(VecGetArray(X->buffer,&qq)); /* reduce into the host view of the buffer */
    } else PetscCallCUDA(cudaFree(d_work));
    PetscCall(PetscMPIIntCast(k,&len));
    PetscCall(MPIU_Allreduce(X->work,qq,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)X)));
    if (!q) PetscCall(VecRestoreArray(X->buffer,&qq));
  } else {
    if (n) {
      PetscCall(PetscLogGpuTimeBegin());
#if defined(PETSC_USE_COMPLEX)
      PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,one,k,n,&sone,d_x,n,d_A,n,&szero,d_work,one));
      PetscCall(ConjugateCudaArray(d_work,k));
#else
      PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,one,k,n,&sone,d_x,one,d_A,n,&szero,d_work,one));
#endif
      PetscCall(PetscLogGpuTimeEnd());
    }
    if (!q) PetscCall(VecCUDARestoreArrayWrite(X->buffer,&d_work));
    else {
      /* caller supplied a host array: copy the result back */
      PetscCallCUDA(cudaMemcpy(q,d_work,k*sizeof(PetscScalar),cudaMemcpyDeviceToHost));
      PetscCall(PetscLogGpuToCpu(k*sizeof(PetscScalar)));
      PetscCallCUDA(cudaFree(d_work));
    }
  }
  PetscCall(VecCUDARestoreArrayRead(z,&d_py));
  PetscCall(VecCUDARestoreArrayRead(x->v,&d_px));
  PetscCall(PetscLogGpuFlops(2.0*n*k));
  PetscFunctionReturn(PETSC_SUCCESS);
}
428: /*
429: y := A'*x computed as y' := x'*A
430: */
PetscErrorCode BVDotVec_Local_Svec_CUDA(BV X,Vec y,PetscScalar *m)
{
  BV_SVEC *x = (BV_SVEC*)X->data;
  const PetscScalar *d_A,*d_x,*d_px,*d_py;
  PetscScalar *d_y,szero=0.0,sone=1.0;
  PetscCuBLASInt n=0,k=0,one=1;
  Vec z = y;
  cublasHandle_t cublasv2handle;

  PetscFunctionBegin;
  PetscCall(PetscCuBLASIntCast(X->n,&n));      /* local rows */
  PetscCall(PetscCuBLASIntCast(X->k-X->l,&k)); /* number of active columns */
  if (X->matrix) {
    /* non-standard inner product: work with z = B*y instead of y */
    PetscCall(BV_IPMatMult(X,y));
    z = X->Bx;
  }
  PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
  PetscCall(VecCUDAGetArrayRead(x->v,&d_px));
  PetscCall(VecCUDAGetArrayRead(z,&d_py));
  d_A = d_px+(X->nc+X->l)*X->n; /* first active column of X */
  d_x = d_py;
  if (n) {
    /* no reduction here: only the local contribution is returned in m */
    PetscCallCUDA(cudaMalloc((void**)&d_y,k*sizeof(PetscScalar)));
    PetscCall(PetscLogGpuTimeBegin());
#if defined(PETSC_USE_COMPLEX)
    /* compute conj(x')*A as a 1 x k gemm, then conjugate to obtain A'*x */
    PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_C,CUBLAS_OP_N,one,k,n,&sone,d_x,n,d_A,n,&szero,d_y,one));
    PetscCall(ConjugateCudaArray(d_y,k));
#else
    PetscCallCUBLAS(cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,one,k,n,&sone,d_x,one,d_A,n,&szero,d_y,one));
#endif
    PetscCall(PetscLogGpuTimeEnd());
    PetscCallCUDA(cudaMemcpy(m,d_y,k*sizeof(PetscScalar),cudaMemcpyDeviceToHost));
    PetscCall(PetscLogGpuToCpu(k*sizeof(PetscScalar)));
    PetscCallCUDA(cudaFree(d_y));
  }
  PetscCall(VecCUDARestoreArrayRead(z,&d_py));
  PetscCall(VecCUDARestoreArrayRead(x->v,&d_px));
  PetscCall(PetscLogGpuFlops(2.0*n*k));
  PetscFunctionReturn(PETSC_SUCCESS);
}
472: /*
473: Scale n scalars
474: */
PetscErrorCode BVScale_Svec_CUDA(BV bv,PetscInt j,PetscScalar alpha)
{
  BV_SVEC        *ctx = (BV_SVEC*)bv->data;
  PetscScalar    *d_pv,*d_ptr;
  PetscCuBLASInt len=0,inc=1;
  cublasHandle_t handle;

  PetscFunctionBegin;
  PetscCall(VecCUDAGetArray(ctx->v,&d_pv));
  if (j<0) {
    /* scale all active columns at once */
    d_ptr = d_pv+(bv->nc+bv->l)*bv->n;
    PetscCall(PetscCuBLASIntCast((bv->k-bv->l)*bv->n,&len));
  } else {
    /* scale just column j */
    d_ptr = d_pv+(bv->nc+j)*bv->n;
    PetscCall(PetscCuBLASIntCast(bv->n,&len));
  }
  if (alpha == (PetscScalar)0.0) PetscCallCUDA(cudaMemset(d_ptr,0,len*sizeof(PetscScalar))); /* zeroing needs no BLAS call */
  else if (alpha != (PetscScalar)1.0) {
    PetscCall(PetscCUBLASGetHandle(&handle));
    PetscCall(PetscLogGpuTimeBegin());
    PetscCallCUBLAS(cublasXscal(handle,len,&alpha,d_ptr,inc));
    PetscCall(PetscLogGpuTimeEnd());
    PetscCall(PetscLogGpuFlops(1.0*len));
  }
  PetscCall(VecCUDARestoreArray(ctx->v,&d_pv));
  PetscFunctionReturn(PETSC_SUCCESS);
}
PetscErrorCode BVMatMult_Svec_CUDA(BV V,Mat A,BV W)
{
  BV_SVEC *v = (BV_SVEC*)V->data,*w = (BV_SVEC*)W->data;
  Mat Vmat,Wmat;
  const PetscScalar *d_pv;
  PetscScalar *d_pw;
  PetscInt j;

  PetscFunctionBegin;
  if (V->vmm) {
    /* one mat-mat product W = A*V via the MatProduct interface */
    PetscCall(BVGetMat(V,&Vmat));
    PetscCall(BVGetMat(W,&Wmat));
    PetscCall(MatProductCreateWithMat(A,Vmat,NULL,Wmat));
    PetscCall(MatProductSetType(Wmat,MATPRODUCT_AB));
    PetscCall(MatProductSetFromOptions(Wmat));
    PetscCall(MatProductSymbolic(Wmat));
    PetscCall(MatProductNumeric(Wmat));
    PetscCall(MatProductClear(Wmat));
    PetscCall(BVRestoreMat(V,&Vmat));
    PetscCall(BVRestoreMat(W,&Wmat));
  } else {
    /* apply A column by column, using workspace vecs that view the BV storage */
    PetscCall(VecCUDAGetArrayRead(v->v,&d_pv));
    PetscCall(VecCUDAGetArrayWrite(w->v,&d_pw));
    for (j=0;j<V->k-V->l;j++) {
      PetscCall(VecCUDAPlaceArray(V->cv[1],(PetscScalar *)d_pv+(V->nc+V->l+j)*V->n));
      PetscCall(VecCUDAPlaceArray(W->cv[1],d_pw+(W->nc+W->l+j)*W->n));
      PetscCall(MatMult(A,V->cv[1],W->cv[1]));
      PetscCall(VecCUDAResetArray(V->cv[1]));
      PetscCall(VecCUDAResetArray(W->cv[1]));
    }
    PetscCall(VecCUDARestoreArrayRead(v->v,&d_pv));
    PetscCall(VecCUDARestoreArrayWrite(w->v,&d_pw));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
PetscErrorCode BVCopy_Svec_CUDA(BV V,BV W)
{
  BV_SVEC           *v = (BV_SVEC*)V->data,*w = (BV_SVEC*)W->data;
  const PetscScalar *d_pv;
  PetscScalar       *d_pw;
  size_t            bytes;

  PetscFunctionBegin;
  /* device-to-device copy of the active columns of V onto the active columns of W */
  PetscCall(VecCUDAGetArrayRead(v->v,&d_pv));
  PetscCall(VecCUDAGetArrayWrite(w->v,&d_pw));
  bytes = (V->k-V->l)*V->n*sizeof(PetscScalar);
  PetscCallCUDA(cudaMemcpy(d_pw+(W->nc+W->l)*W->n,d_pv+(V->nc+V->l)*V->n,bytes,cudaMemcpyDeviceToDevice));
  PetscCall(VecCUDARestoreArrayRead(v->v,&d_pv));
  PetscCall(VecCUDARestoreArrayWrite(w->v,&d_pw));
  PetscFunctionReturn(PETSC_SUCCESS);
}
PetscErrorCode BVCopyColumn_Svec_CUDA(BV V,PetscInt j,PetscInt i)
{
  BV_SVEC     *v = (BV_SVEC*)V->data;
  PetscScalar *d_pv;
  PetscInt    src,dst;

  PetscFunctionBegin;
  /* copy column j onto column i on the device (constraint columns offset by nc) */
  src = (V->nc+j)*V->n;
  dst = (V->nc+i)*V->n;
  PetscCall(VecCUDAGetArray(v->v,&d_pv));
  PetscCallCUDA(cudaMemcpy(d_pv+dst,d_pv+src,V->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice));
  PetscCall(VecCUDARestoreArray(v->v,&d_pv));
  PetscFunctionReturn(PETSC_SUCCESS);
}
PetscErrorCode BVResize_Svec_CUDA(BV bv,PetscInt m,PetscBool copy)
{
  BV_SVEC *ctx = (BV_SVEC*)bv->data;
  const PetscScalar *d_pv;
  PetscScalar *d_pnew;
  PetscInt bs;
  Vec vnew;
  char str[50];

  PetscFunctionBegin;
  /* create a replacement storage vector with room for m columns, same type as the template */
  PetscCall(VecGetBlockSize(bv->t,&bs));
  PetscCall(VecCreate(PetscObjectComm((PetscObject)bv->t),&vnew));
  PetscCall(VecSetType(vnew,((PetscObject)bv->t)->type_name));
  PetscCall(VecSetSizes(vnew,m*bv->n,PETSC_DECIDE));
  PetscCall(VecSetBlockSize(vnew,bs));
  if (((PetscObject)bv)->name) {
    PetscCall(PetscSNPrintf(str,sizeof(str),"%s_0",((PetscObject)bv)->name));
    PetscCall(PetscObjectSetName((PetscObject)vnew,str));
  }
  if (copy) {
    /* device-to-device copy of as many columns as fit in both old and new storage */
    PetscCall(VecCUDAGetArrayRead(ctx->v,&d_pv));
    PetscCall(VecCUDAGetArrayWrite(vnew,&d_pnew));
    PetscCallCUDA(cudaMemcpy(d_pnew,d_pv,PetscMin(m,bv->m)*bv->n*sizeof(PetscScalar),cudaMemcpyDeviceToDevice));
    PetscCall(VecCUDARestoreArrayRead(ctx->v,&d_pv));
    PetscCall(VecCUDARestoreArrayWrite(vnew,&d_pnew));
  }
  /* swap in the new vector; the old one is destroyed */
  PetscCall(VecDestroy(&ctx->v));
  ctx->v = vnew;
  PetscFunctionReturn(PETSC_SUCCESS);
}
PetscErrorCode BVGetColumn_Svec_CUDA(BV bv,PetscInt j,Vec *v)
{
  BV_SVEC     *ctx = (BV_SVEC*)bv->data;
  PetscScalar *d_pv;
  PetscInt    l;

  PetscFunctionBegin;
  (void)v; /* the output Vec is handled by the generic BVGetColumn layer */
  l = BVAvailableVec;
  /* make workspace vector cv[l] view column j of the contiguous device storage */
  PetscCall(VecCUDAGetArray(ctx->v,&d_pv));
  PetscCall(VecCUDAPlaceArray(bv->cv[l],d_pv+(bv->nc+j)*bv->n));
  PetscFunctionReturn(PETSC_SUCCESS);
}
PetscErrorCode BVRestoreColumn_Svec_CUDA(BV bv,PetscInt j,Vec *v)
{
  BV_SVEC  *ctx = (BV_SVEC*)bv->data;
  PetscInt l;

  PetscFunctionBegin;
  (void)v; /* the column Vec itself is handled by the generic BVRestoreColumn layer */
  /* determine which of the two workspace vectors currently views column j */
  l = (j==bv->ci[0])? 0: 1;
  PetscCall(VecCUDAResetArray(bv->cv[l]));
  PetscCall(VecCUDARestoreArray(ctx->v,NULL));
  PetscFunctionReturn(PETSC_SUCCESS);
}
PetscErrorCode BVRestoreSplit_Svec_CUDA(BV bv,BV *L,BV *R)
{
  Vec v;
  const PetscScalar *d_pv;
  PetscObjectState lstate,rstate;
  PetscBool change=PETSC_FALSE;

  PetscFunctionBegin;
  /* force sync flag to PETSC_CUDA_BOTH */
  if (L) {
    PetscCall(PetscObjectStateGet((PetscObject)*L,&lstate));
    if (lstate != bv->lstate) {
      /* the left part changed since the split: a read get/restore pair refreshes
         its device sync state */
      v = ((BV_SVEC*)bv->L->data)->v;
      PetscCall(VecCUDAGetArrayRead(v,&d_pv));
      PetscCall(VecCUDARestoreArrayRead(v,&d_pv));
      change = PETSC_TRUE;
    }
  }
  if (R) {
    PetscCall(PetscObjectStateGet((PetscObject)*R,&rstate));
    if (rstate != bv->rstate) {
      /* same treatment for the right part */
      v = ((BV_SVEC*)bv->R->data)->v;
      PetscCall(VecCUDAGetArrayRead(v,&d_pv));
      PetscCall(VecCUDARestoreArrayRead(v,&d_pv));
      change = PETSC_TRUE;
    }
  }
  if (change) {
    /* either part changed: mark the parent storage as modified via a write get/restore */
    v = ((BV_SVEC*)bv->data)->v;
    PetscCall(VecCUDAGetArray(v,(PetscScalar **)&d_pv));
    PetscCall(VecCUDARestoreArray(v,(PetscScalar **)&d_pv));
  }
  PetscFunctionReturn(PETSC_SUCCESS);
}
PetscErrorCode BVGetMat_Svec_CUDA(BV bv,Mat *A)
{
  BV_SVEC *ctx = (BV_SVEC*)bv->data;
  PetscScalar *vv,*aa;
  PetscBool create=PETSC_FALSE;
  PetscInt m,cols;

  PetscFunctionBegin;
  m = bv->k-bv->l; /* number of active columns the Mat will expose */
  if (!bv->Aget) create=PETSC_TRUE;
  else {
    PetscCall(MatDenseCUDAGetArray(bv->Aget,&aa));
    /* a non-NULL array means the cached Mat is still checked out */
    PetscCheck(!aa,PetscObjectComm((PetscObject)bv),PETSC_ERR_ARG_WRONGSTATE,"BVGetMat already called on this BV");
    PetscCall(MatGetSize(bv->Aget,NULL,&cols));
    if (cols!=m) {
      /* the number of active columns changed: discard the cached Mat */
      PetscCall(MatDestroy(&bv->Aget));
      create=PETSC_TRUE;
    }
  }
  PetscCall(VecCUDAGetArray(ctx->v,&vv));
  if (create) {
    PetscCall(MatCreateDenseCUDA(PetscObjectComm((PetscObject)bv),bv->n,PETSC_DECIDE,bv->N,m,vv,&bv->Aget)); /* pass a pointer to avoid allocation of storage */
    PetscCall(MatDenseCUDAReplaceArray(bv->Aget,NULL)); /* replace with a null pointer, the value after BVRestoreMat */
  }
  PetscCall(MatDenseCUDAPlaceArray(bv->Aget,vv+(bv->nc+bv->l)*bv->n)); /* set the actual pointer */
  *A = bv->Aget;
  PetscFunctionReturn(PETSC_SUCCESS);
}
PetscErrorCode BVRestoreMat_Svec_CUDA(BV bv,Mat *A)
{
  BV_SVEC     *ctx = (BV_SVEC*)bv->data;
  PetscScalar *base,*placed;

  PetscFunctionBegin;
  /* the cached Mat wraps the storage starting at the first active column; undo
     that offset to recover the base pointer of ctx->v before restoring it */
  PetscCall(MatDenseCUDAGetArray(bv->Aget,&placed));
  base = placed-(bv->nc+bv->l)*bv->n;
  PetscCall(MatDenseCUDAResetArray(bv->Aget));
  PetscCall(VecCUDARestoreArray(ctx->v,&base));
  *A = NULL;
  PetscFunctionReturn(PETSC_SUCCESS);
}