Actual source code: bvorthogcuda.cu
slepc-3.19.0 2023-03-31
1: /*
2: - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
3: SLEPc - Scalable Library for Eigenvalue Problem Computations
4: Copyright (c) 2002-, Universitat Politecnica de Valencia, Spain
6: This file is part of SLEPc.
7: SLEPc is distributed under a 2-clause BSD license (see LICENSE).
8: - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
9: */
10: /*
11: BV orthogonalization routines (CUDA)
12: */
14: #include <slepc/private/bvimpl.h>
15: #include <slepcblaslapack.h>
16: #include <slepccublas.h>
18: /*
19: BV_CleanCoefficients_CUDA - Sets to zero all entries of column j of the bv buffer
20: */
21: PetscErrorCode BV_CleanCoefficients_CUDA(BV bv,PetscInt j,PetscScalar *h)
22: {
23: PetscScalar *d_hh,*d_a;
24: PetscInt i;
26: PetscFunctionBegin;
27: if (!h) {
28: PetscCall(VecCUDAGetArray(bv->buffer,&d_a));
29: PetscCall(PetscLogGpuTimeBegin());
30: d_hh = d_a + j*(bv->nc+bv->m);
31: PetscCallCUDA(cudaMemset(d_hh,0,(bv->nc+j)*sizeof(PetscScalar)));
32: PetscCall(PetscLogGpuTimeEnd());
33: PetscCall(VecCUDARestoreArray(bv->buffer,&d_a));
34: } else { /* cpu memory */
35: for (i=0;i<bv->nc+j;i++) h[i] = 0.0;
36: }
37: PetscFunctionReturn(PETSC_SUCCESS);
38: }
40: /*
41: BV_AddCoefficients_CUDA - Add the contents of the scratch (0-th column) of the bv buffer
42: into column j of the bv buffer
43: */
44: PetscErrorCode BV_AddCoefficients_CUDA(BV bv,PetscInt j,PetscScalar *h,PetscScalar *c)
45: {
46: PetscScalar *d_h,*d_c,sone=1.0;
47: PetscInt i;
48: PetscCuBLASInt idx=0,one=1;
49: cublasHandle_t cublasv2handle;
51: PetscFunctionBegin;
52: if (!h) {
53: PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
54: PetscCall(VecCUDAGetArray(bv->buffer,&d_c));
55: d_h = d_c + j*(bv->nc+bv->m);
56: PetscCall(PetscCuBLASIntCast(bv->nc+j,&idx));
57: PetscCall(PetscLogGpuTimeBegin());
58: PetscCallCUBLAS(cublasXaxpy(cublasv2handle,idx,&sone,d_c,one,d_h,one));
59: PetscCall(PetscLogGpuTimeEnd());
60: PetscCall(PetscLogGpuFlops(1.0*(bv->nc+j)));
61: PetscCall(VecCUDARestoreArray(bv->buffer,&d_c));
62: } else { /* cpu memory */
63: for (i=0;i<bv->nc+j;i++) h[i] += c[i];
64: PetscCall(PetscLogFlops(1.0*(bv->nc+j)));
65: }
66: PetscFunctionReturn(PETSC_SUCCESS);
67: }
69: /*
70: BV_SetValue_CUDA - Sets value in row j (counted after the constraints) of column k
71: of the coefficients array
72: */
73: PetscErrorCode BV_SetValue_CUDA(BV bv,PetscInt j,PetscInt k,PetscScalar *h,PetscScalar value)
74: {
75: PetscScalar *d_h,*a;
77: PetscFunctionBegin;
78: if (!h) {
79: PetscCall(VecCUDAGetArray(bv->buffer,&a));
80: PetscCall(PetscLogGpuTimeBegin());
81: d_h = a + k*(bv->nc+bv->m) + bv->nc+j;
82: PetscCallCUDA(cudaMemcpy(d_h,&value,sizeof(PetscScalar),cudaMemcpyHostToDevice));
83: PetscCall(PetscLogCpuToGpu(sizeof(PetscScalar)));
84: PetscCall(PetscLogGpuTimeEnd());
85: PetscCall(VecCUDARestoreArray(bv->buffer,&a));
86: } else { /* cpu memory */
87: h[bv->nc+j] = value;
88: }
89: PetscFunctionReturn(PETSC_SUCCESS);
90: }
92: /*
93: BV_SquareSum_CUDA - Returns the value h'*h, where h represents the contents of the
94: coefficients array (up to position j)
95: */
96: PetscErrorCode BV_SquareSum_CUDA(BV bv,PetscInt j,PetscScalar *h,PetscReal *sum)
97: {
98: const PetscScalar *d_h;
99: PetscScalar dot;
100: PetscInt i;
101: PetscCuBLASInt idx=0,one=1;
102: cublasHandle_t cublasv2handle;
104: PetscFunctionBegin;
105: if (!h) {
106: PetscCall(PetscCUBLASGetHandle(&cublasv2handle));
107: PetscCall(VecCUDAGetArrayRead(bv->buffer,&d_h));
108: PetscCall(PetscCuBLASIntCast(bv->nc+j,&idx));
109: PetscCall(PetscLogGpuTimeBegin());
110: PetscCallCUBLAS(cublasXdotc(cublasv2handle,idx,d_h,one,d_h,one,&dot));
111: PetscCall(PetscLogGpuTimeEnd());
112: PetscCall(PetscLogGpuFlops(2.0*(bv->nc+j)));
113: *sum = PetscRealPart(dot);
114: PetscCall(VecCUDARestoreArrayRead(bv->buffer,&d_h));
115: } else { /* cpu memory */
116: *sum = 0.0;
117: for (i=0;i<bv->nc+j;i++) *sum += PetscRealPart(h[i]*PetscConj(h[i]));
118: PetscCall(PetscLogFlops(2.0*(bv->nc+j)));
119: }
120: PetscFunctionReturn(PETSC_SUCCESS);
121: }
123: /* pointwise multiplication */
124: __global__ void PointwiseMult_kernel(PetscInt xcount,PetscScalar *a,const PetscScalar *b,PetscInt n)
125: {
126: PetscInt x;
128: x = xcount*gridDim.x*blockDim.x+blockIdx.x*blockDim.x+threadIdx.x;
129: if (x<n) a[x] *= PetscRealPart(b[x]);
130: }
132: /* pointwise division */
133: __global__ void PointwiseDiv_kernel(PetscInt xcount,PetscScalar *a,const PetscScalar *b,PetscInt n)
134: {
135: PetscInt x;
137: x = xcount*gridDim.x*blockDim.x+blockIdx.x*blockDim.x+threadIdx.x;
138: if (x<n) a[x] /= PetscRealPart(b[x]);
139: }
141: /*
142: BV_ApplySignature_CUDA - Computes the pointwise product h*omega, where h represents
143: the contents of the coefficients array (up to position j) and omega is the signature;
144: if inverse=TRUE then the operation is h/omega
145: */
146: PetscErrorCode BV_ApplySignature_CUDA(BV bv,PetscInt j,PetscScalar *h,PetscBool inverse)
147: {
148: PetscScalar *d_h;
149: const PetscScalar *d_omega,*omega;
150: PetscInt i,xcount;
151: dim3 blocks3d, threads3d;
153: PetscFunctionBegin;
154: if (!(bv->nc+j)) PetscFunctionReturn(PETSC_SUCCESS);
155: if (!h) {
156: PetscCall(VecCUDAGetArray(bv->buffer,&d_h));
157: PetscCall(VecCUDAGetArrayRead(bv->omega,&d_omega));
158: PetscCall(SlepcKernelSetGrid1D(bv->nc+j,&blocks3d,&threads3d,&xcount));
159: PetscCall(PetscLogGpuTimeBegin());
160: if (inverse) {
161: for (i=0;i<xcount;i++) PointwiseDiv_kernel<<<blocks3d,threads3d>>>(i,d_h,d_omega,bv->nc+j);
162: } else {
163: for (i=0;i<xcount;i++) PointwiseMult_kernel<<<blocks3d,threads3d>>>(i,d_h,d_omega,bv->nc+j);
164: }
165: PetscCallCUDA(cudaGetLastError());
166: PetscCall(PetscLogGpuTimeEnd());
167: PetscCall(PetscLogGpuFlops(1.0*(bv->nc+j)));
168: PetscCall(VecCUDARestoreArrayRead(bv->omega,&d_omega));
169: PetscCall(VecCUDARestoreArray(bv->buffer,&d_h));
170: } else {
171: PetscCall(VecGetArrayRead(bv->omega,&omega));
172: if (inverse) for (i=0;i<bv->nc+j;i++) h[i] /= PetscRealPart(omega[i]);
173: else for (i=0;i<bv->nc+j;i++) h[i] *= PetscRealPart(omega[i]);
174: PetscCall(VecRestoreArrayRead(bv->omega,&omega));
175: PetscCall(PetscLogFlops(1.0*(bv->nc+j)));
176: }
177: PetscFunctionReturn(PETSC_SUCCESS);
178: }
180: /*
181: BV_SquareRoot_CUDA - Returns the square root of position j (counted after the constraints)
182: of the coefficients array
183: */
184: PetscErrorCode BV_SquareRoot_CUDA(BV bv,PetscInt j,PetscScalar *h,PetscReal *beta)
185: {
186: const PetscScalar *d_h;
187: PetscScalar hh;
189: PetscFunctionBegin;
190: if (!h) {
191: PetscCall(VecCUDAGetArrayRead(bv->buffer,&d_h));
192: PetscCall(PetscLogGpuTimeBegin());
193: PetscCallCUDA(cudaMemcpy(&hh,d_h+bv->nc+j,sizeof(PetscScalar),cudaMemcpyDeviceToHost));
194: PetscCall(PetscLogGpuToCpu(sizeof(PetscScalar)));
195: PetscCall(PetscLogGpuTimeEnd());
196: PetscCall(BV_SafeSqrt(bv,hh,beta));
197: PetscCall(VecCUDARestoreArrayRead(bv->buffer,&d_h));
198: } else PetscCall(BV_SafeSqrt(bv,h[bv->nc+j],beta));
199: PetscFunctionReturn(PETSC_SUCCESS);
200: }
202: /*
203: BV_StoreCoefficients_CUDA - Copy the contents of the coefficients array to an array dest
204: provided by the caller (only values from l to j are copied)
205: */
206: PetscErrorCode BV_StoreCoefficients_CUDA(BV bv,PetscInt j,PetscScalar *h,PetscScalar *dest)
207: {
208: const PetscScalar *d_h,*d_a;
209: PetscInt i;
211: PetscFunctionBegin;
212: if (!h) {
213: PetscCall(VecCUDAGetArrayRead(bv->buffer,&d_a));
214: PetscCall(PetscLogGpuTimeBegin());
215: d_h = d_a + j*(bv->nc+bv->m)+bv->nc;
216: PetscCallCUDA(cudaMemcpy(dest-bv->l,d_h,(j-bv->l)*sizeof(PetscScalar),cudaMemcpyDeviceToHost));
217: PetscCall(PetscLogGpuToCpu((j-bv->l)*sizeof(PetscScalar)));
218: PetscCall(PetscLogGpuTimeEnd());
219: PetscCall(VecCUDARestoreArrayRead(bv->buffer,&d_a));
220: } else {
221: for (i=bv->l;i<j;i++) dest[i-bv->l] = h[bv->nc+i];
222: }
223: PetscFunctionReturn(PETSC_SUCCESS);
224: }