Actual source code: ex20.c

static const char help[] = "PetscSF Ping-pong test to measure MPI latency\n\n";

/*
  A simple test to measure the latency of MPI communication.
  The test is run with two processes. The first process sends a message
  to the second process, and upon receiving it, the second process sends
  a message back to the first. This round-trip is repeated a number of
  times, and the latency is defined as half the average round-trip time.

  It mimics osu_latency from the OSU microbenchmarks (https://mvapich.cse.ohio-state.edu/benchmarks/).

  Usage: mpirun -n 2 ./ex20 -mtype <type>
  Other arguments have default values that match those used in osu_latency.

  Examples:

  On Summit at OLCF:
    jsrun --smpiargs "-gpu" -n 2 -a 1 -c 7 -g 1 -r 2 -l GPU-GPU -d packed -b packed:7 ./ex20 -mtype cuda

  On Crusher at OLCF:
    srun -n2 -c32 --cpu-bind=map_cpu:0,1 --gpus-per-node=8 --gpu-bind=map_gpu:0,1 ./ex20 -mtype hip
*/

#include <petscsf.h>
#include <petscdevice.h>
#if defined(PETSC_HAVE_UNISTD_H)
#include <unistd.h>
#endif

/* Same values as OSU microbenchmarks */
#define LAT_LOOP_SMALL 10000
#define LAT_SKIP_SMALL 100
#define LAT_LOOP_LARGE 1000
#define LAT_SKIP_LARGE 10
#define LARGE_MESSAGE_SIZE 8192

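/* Allocate a buffer of the requested memory type: page-aligned host memory when
   possible (matching what osu_latency does for host buffers), or device memory
   through cudaMalloc()/hipMalloc() in CUDA/HIP builds. */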
static inline PetscErrorCode PetscMallocWithMemType(PetscMemType mtype,size_t size,void** ptr)
{
  if (PetscMemTypeHost(mtype)) {
    #if defined(PETSC_HAVE_GETPAGESIZE)
      posix_memalign(ptr,getpagesize(),size);
    #else
      PetscMalloc(size,ptr);
    #endif
  }
#if defined(PETSC_HAVE_CUDA)
  else if (PetscMemTypeCUDA(mtype)) cudaMalloc(ptr,size);
#elif defined(PETSC_HAVE_HIP)
  else if (PetscMemTypeHIP(mtype))  hipMalloc(ptr,size);
#endif
  return 0;
}

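/* Release a buffer allocated by PetscMallocWithMemType(), dispatching on the
   same memory type that was used for the allocation. */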
static inline PetscErrorCode PetscFreeWithMemType_Private(PetscMemType mtype,void* ptr)
{
  if (PetscMemTypeHost(mtype)) {free(ptr);}
#if defined(PETSC_HAVE_CUDA)
  else if (PetscMemTypeCUDA(mtype)) cudaFree(ptr);
#elif defined(PETSC_HAVE_HIP)
  else if (PetscMemTypeHIP(mtype))  hipFree(ptr);
#endif
  return 0;
}

/* Free memory and set ptr to NULL on success */
#define PetscFreeWithMemType(t,p) ((p) && (PetscFreeWithMemType_Private((t),(p)) || ((p)=NULL,0)))
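/* The macro above relies on short-circuit evaluation: a NULL pointer is never
   freed, and (p) is reset to NULL only when the free routine returns 0 (success);
   the comma operator then makes the whole expression evaluate to 0. */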

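/* Copy n bytes from a host buffer into a buffer of the given memory type,
   using the appropriate host-to-device copy for CUDA/HIP destinations. */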
static inline PetscErrorCode PetscMemcpyFromHostWithMemType(PetscMemType mtype,void* dst, const void *src, size_t n)
{
  if (PetscMemTypeHost(mtype)) PetscMemcpy(dst,src,n);
#if defined(PETSC_HAVE_CUDA)
  else if (PetscMemTypeCUDA(mtype)) cudaMemcpy(dst,src,n,cudaMemcpyHostToDevice);
#elif defined(PETSC_HAVE_HIP)
  else if (PetscMemTypeHIP(mtype))  hipMemcpy(dst,src,n,hipMemcpyHostToDevice);
#endif
  return 0;
}

int main(int argc,char **argv)
{
  PetscSF           sf[64];
  PetscLogDouble    t_start=0,t_end=0,time[64];
  PetscInt          i,j,n,nroots,nleaves,niter=100,nskip=10;
  PetscInt          maxn=512*1024; /* max message size: 512K PetscScalars, i.e., 4MB with 8-byte scalars */
  PetscSFNode       *iremote;
  PetscMPIInt       rank,size;
  PetscScalar       *rootdata=NULL,*leafdata=NULL,*pbuf,*ebuf;
  size_t            msgsize;
  PetscMemType      mtype = PETSC_MEMTYPE_HOST;
  char              mstring[16]={0};
  PetscBool         isCuda,isHip,isHost,set;
  PetscInt          skipSmall=-1,loopSmall=-1;
  MPI_Op            op = MPI_REPLACE;

  PetscInitialize(&argc,&argv,NULL,help);
  /* Must init the device first if one wants to call PetscGetMemType() without creating PETSc device objects */
#if defined(PETSC_HAVE_CUDA)
  PetscDeviceInitialize(PETSC_DEVICE_CUDA);
#elif defined(PETSC_HAVE_HIP)
  PetscDeviceInitialize(PETSC_DEVICE_HIP);
#endif
  MPI_Comm_size(PETSC_COMM_WORLD,&size);
  MPI_Comm_rank(PETSC_COMM_WORLD,&rank);
  if (size != 2) SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_WRONG_MPI_SIZE,"This test must be run with exactly 2 MPI processes");

  PetscOptionsGetInt(NULL,NULL,"-maxn",&maxn,NULL); /* maxn PetscScalars */
  PetscOptionsGetInt(NULL,NULL,"-skipSmall",&skipSmall,NULL);
  PetscOptionsGetInt(NULL,NULL,"-loopSmall",&loopSmall,NULL);

  PetscMalloc1(maxn,&iremote);
  PetscOptionsGetString(NULL,NULL,"-mtype",mstring,16,&set);
  if (set) {
    PetscStrcasecmp(mstring,"cuda",&isCuda);
    PetscStrcasecmp(mstring,"hip",&isHip);
    PetscStrcasecmp(mstring,"host",&isHost);

    if (isHost) mtype = PETSC_MEMTYPE_HOST;
    else if (isCuda) mtype = PETSC_MEMTYPE_CUDA;
    else if (isHip) mtype = PETSC_MEMTYPE_HIP;
    else SETERRQ(PETSC_COMM_WORLD,PETSC_ERR_ARG_WRONG,"Unknown memory type: %s",mstring);
  }

  PetscMallocWithMemType(mtype,sizeof(PetscScalar)*maxn,(void**)&rootdata);
  PetscMallocWithMemType(mtype,sizeof(PetscScalar)*maxn,(void**)&leafdata);

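  /* Host staging buffers, filled with distinct sentinel values so that root and
     leaf data remain distinguishable after the copies below. */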
  PetscMalloc2(maxn,&pbuf,maxn,&ebuf);
  for (i=0; i<maxn; i++) {
    pbuf[i] = 123.0;
    ebuf[i] = 456.0;
  }

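  /* Build one SF per message size n = 1, 2, 4, ..., maxn. Rank 0 owns n roots and
     has no leaves; rank 1 has n leaves, each referencing root j on rank 0. Every
     Bcast/Reduce on sf[i] therefore moves exactly n PetscScalars between the ranks. */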
  for (n=1,i=0; n<=maxn; n*=2,i++) {
    PetscSFCreate(PETSC_COMM_WORLD,&sf[i]);
    PetscSFSetFromOptions(sf[i]);
    if (!rank) {
      nroots  = n;
      nleaves = 0;
    } else {
      nroots  = 0;
      nleaves = n;
      for (j=0; j<nleaves; j++) {
        iremote[j].rank  = 0;
        iremote[j].index = j;
      }
    }
    PetscSFSetGraph(sf[i],nroots,nleaves,NULL,PETSC_COPY_VALUES,iremote,PETSC_COPY_VALUES);
    PetscSFSetUp(sf[i]);
  }

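  /* Following the OSU convention: the first nskip iterations are untimed warm-up,
     and small messages are timed over more iterations than large ones. */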
  if (loopSmall > 0) {
    nskip = skipSmall;
    niter = loopSmall;
  } else {
    nskip = LAT_SKIP_SMALL;
    niter = LAT_LOOP_SMALL;
  }

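  /* For each message size: upload fresh data, then time niter round-trips. Each
     iteration is one ping-pong: the Bcast pushes rootdata from rank 0 into rank 1's
     leafdata (the ping), and the Reduce with MPI_REPLACE pushes leafdata back from
     rank 1 into rank 0's rootdata (the pong). */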
  for (n=1,j=0; n<=maxn; n*=2,j++) {
    msgsize = sizeof(PetscScalar)*n;
    PetscMemcpyFromHostWithMemType(mtype,rootdata,pbuf,msgsize);
    PetscMemcpyFromHostWithMemType(mtype,leafdata,ebuf,msgsize);

    if (msgsize > LARGE_MESSAGE_SIZE) {
      nskip = LAT_SKIP_LARGE;
      niter = LAT_LOOP_LARGE;
    }
    MPI_Barrier(PETSC_COMM_WORLD);

    for (i=0; i<niter + nskip; i++) {
      if (i == nskip) {
       #if defined(PETSC_HAVE_CUDA)
        cudaDeviceSynchronize();
       #elif defined(PETSC_HAVE_HIP)
        hipDeviceSynchronize();
       #endif
        MPI_Barrier(PETSC_COMM_WORLD);
        t_start = MPI_Wtime();
      }
      PetscSFBcastWithMemTypeBegin(sf[j],MPIU_SCALAR,mtype,rootdata,mtype,leafdata,op);
      PetscSFBcastEnd(sf[j],MPIU_SCALAR,rootdata,leafdata,op);
      PetscSFReduceWithMemTypeBegin(sf[j],MPIU_SCALAR,mtype,leafdata,mtype,rootdata,op);
      PetscSFReduceEnd(sf[j],MPIU_SCALAR,leafdata,rootdata,op);
    }
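    /* Drain any outstanding device work on both ranks before stopping the clock,
       so the measured interval covers complete round-trips only. */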
   #if defined(PETSC_HAVE_CUDA)
    cudaDeviceSynchronize();
   #elif defined(PETSC_HAVE_HIP)
    hipDeviceSynchronize();
   #endif
    MPI_Barrier(PETSC_COMM_WORLD);
    t_end   = MPI_Wtime();
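    /* Convert to one-way latency in microseconds: each iteration is a full
       round-trip, so the average is halved. */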
    time[j] = (t_end - t_start)*1e6 / (niter*2);
  }

  PetscPrintf(PETSC_COMM_WORLD,"\t##  PetscSF Ping-pong test on %s ##\n  Message(Bytes) \t\tLatency(us)\n",mtype==PETSC_MEMTYPE_HOST ? "Host" : "Device");
  for (n=1,j=0; n<=maxn; n*=2,j++) {
    PetscSFDestroy(&sf[j]);
    PetscPrintf(PETSC_COMM_WORLD,"%16D \t %16.4f\n",(PetscInt)(sizeof(PetscScalar)*n),time[j]);
  }

  PetscFree2(pbuf,ebuf);
  PetscFreeWithMemType(mtype,rootdata);
  PetscFreeWithMemType(mtype,leafdata);
  PetscFree(iremote);
  PetscFinalize();
  return 0;
}

/**TEST
  testset:
    # use small numbers to make the test cheap
    args: -maxn 4 -skipSmall 1 -loopSmall 1
    filter: grep "DOES_NOT_EXIST"
    output_file: output/empty.out
    nsize: 2

    test:
      args: -mtype host

    test:
      requires: cuda
      args: -mtype cuda

    test:
      requires: hip
      args: -mtype hip
TEST**/