Linux MPI+CUDA Mixed Compilation

Source files (main.cpp, pi_cu.cu)

main.cpp

#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>
#define NBIN 10000000 // Number of bins
#define NUM_BLOCK 13 // Number of thread blocks
#define NUM_THREAD 192 // Number of threads per block

// Kernel that executes on the CUDA device
void computePI(int nproc,int myid,float *sumHost,float step);

int main(int argc,char **argv) {
    int myid,nproc,tid, nbin;
    float pi=0.0, pig, step;
    float *sumHost;                        // Pointers to host arrays
    MPI_Init(&argc,&argv);
    MPI_Comm_rank(MPI_COMM_WORLD,&myid);  // My MPI rank
    MPI_Comm_size(MPI_COMM_WORLD,&nproc); // Number of MPI processes

    size_t size = NUM_BLOCK*NUM_THREAD*sizeof(float); //Array memory size
    sumHost = (float *)malloc(size);      // Allocate array on host
    nbin = NBIN/nproc;                    // Number of bins per MPI process
    step = 1.0/(float)(nbin*nproc);       // Step size with redefined number of bins
    computePI(nproc,myid,sumHost,step);

    /* Reduction over CUDA threads */
    for (tid=0; tid<NUM_THREAD*NUM_BLOCK; tid++) pi += sumHost[tid];
    printf("step = %11.7f\n", step);
    pi *= step;
    free(sumHost);
    printf("myid = %d: partial pi = %11.7f\n", myid, pi);

    /* Reduction over MPI processes */
    MPI_Allreduce(&pi,&pig,1,MPI_FLOAT,MPI_SUM,MPI_COMM_WORLD);
    if (myid==0) printf("PI = %11.7f\n",pig);
    MPI_Finalize();
    return 0;
}
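One thing to watch for: computePI is defined in pi_cu.cu, which nvcc compiles as C++, while main.cpp goes through mpicc. If the two host compilers do not mangle the name the same way, the link step fails with an undefined reference to computePI. A common precaution, shown here only as a hedged sketch (it is not in the original listings), is to give the bridge function C linkage on both sides:

/* In main.cpp -- declare the bridge function with C linkage */
#ifdef __cplusplus
extern "C"
#endif
void computePI(int nproc, int myid, float *sumHost, float step);

/* In pi_cu.cu -- wrap the definition the same way */
extern "C" void computePI(int nproc, int myid, float *sumHost, float step)
{
    /* ... body unchanged ... */
}

With the toolchain used below (Intel mpicc as host compiler, nvcc for the .cu file) the unwrapped version links fine, so treat this only as a safeguard for other compiler combinations.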

pi_cu.cu

#include<stdio.h>
#include<stdlib.h>
#define NBIN 10000000 // Number of bins
#define NUM_BLOCK 13  // Number of thread blocks
#define NUM_THREAD 192 // Number of threads per block

__global__ void cal_pi(float *sum, int nbin, float step, float offset, int nthreads, int nblocks)
{
    int i;
    float x;
    int idx = blockIdx.x*blockDim.x+threadIdx.x; // Sequential thread index across blocks
    for (i=idx; i< nbin; i+=nthreads*nblocks) { // Interleaved bin assignment to threads
        x = offset+(i+0.5)*step;
        sum[idx] += 4.0/(1.0+x*x);
    }
}

void computePI(int nproc,int myid, float *sumHost,float step)
{
 int nbin;
 float offset;
 float *sumDev;                  // Pointers to device arrays
 dim3 dimGrid(NUM_BLOCK,1,1);    // Grid dimensions (only use 1D)
 dim3 dimBlock(NUM_THREAD,1,1);  // Block dimensions (only use 1D)
 
 nbin = NBIN/nproc;              // Number of bins per MPI process
 offset = myid*step*nbin;        // Quadrature-point offset
 size_t size = NUM_BLOCK*NUM_THREAD*sizeof(float);   //Array memory size
 cudaMalloc((void **) &sumDev,size);                 // Allocate array on device
 cudaMemset(sumDev,0,size);                          // Reset array in device to 0

 /*  Calculate on device (call CUDA kernel)  */
 cal_pi <<<dimGrid,dimBlock>>> (sumDev,nbin,step,offset,NUM_THREAD,NUM_BLOCK);
 
 /* Retrieve result from device and store it in host array  */
 cudaMemcpy(sumHost,sumDev,size,cudaMemcpyDeviceToHost);
 
 cudaFree(sumDev);
}
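Neither cudaMalloc, the kernel launch, nor cudaMemcpy is checked for errors above, so a failed CUDA call goes unnoticed (which is probably why the plain mpirun run further down silently returns zeros). A minimal error-checking sketch, assuming a helper macro of my own naming (CUDA_CHECK is not part of the original code), could look like this:

#include <stdio.h>
#include <stdlib.h>
#include <cuda_runtime.h>

/* Hypothetical helper: abort with a message if a CUDA runtime call fails */
#define CUDA_CHECK(call)                                              \
    do {                                                              \
        cudaError_t err = (call);                                     \
        if (err != cudaSuccess) {                                     \
            fprintf(stderr, "CUDA error %s at %s:%d\n",               \
                    cudaGetErrorString(err), __FILE__, __LINE__);     \
            exit(EXIT_FAILURE);                                       \
        }                                                             \
    } while (0)

/* Usage inside computePI: */
CUDA_CHECK(cudaMalloc((void **) &sumDev, size));
CUDA_CHECK(cudaMemset(sumDev, 0, size));
cal_pi <<<dimGrid,dimBlock>>> (sumDev, nbin, step, offset, NUM_THREAD, NUM_BLOCK);
CUDA_CHECK(cudaGetLastError());   /* catches kernel-launch errors */
CUDA_CHECK(cudaMemcpy(sumHost, sumDev, size, cudaMemcpyDeviceToHost));
CUDA_CHECK(cudaFree(sumDev));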

Submitting the job with a bsub script

test.bsub

#BSUB -W 0:10
#BSUB -n 4
#BSUB -R "span[ptile=2]"
#BSUB -q "gpu"
#BSUB -o res.out
#BSUB -e out.err
 
module unload compiler
module load compiler/intel/composer_xe_2013_sp1.0.080
module unload mpi
module load mpi/mvapich2/1.9/intel
module unload cuda
module load cuda/6.0.37
mpijob.mvapich2 ./gpu-pi
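With -n 4 and span[ptile=2], two MPI ranks share each node. The program never calls cudaSetDevice, so both ranks on a node end up on the default device 0; on a multi-GPU node you may want to spread them out. A hedged sketch (my addition, not in the original) that picks a device round-robin by rank at the top of computePI:

int ndev = 0;
cudaGetDeviceCount(&ndev);          /* GPUs visible on this node */
if (ndev > 0)
    cudaSetDevice(myid % ndev);     /* map MPI rank to a device, round-robin */

Strictly, the node-local rank is the right index here; indexing by the global rank only works out when ranks are placed on nodes contiguously, as they are with this ptile setting.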

Submitting the job

    bsub < test.bsub

Makefile

all:gpu-pi

CFLAGS+=-O3
NVCCFLAGS+=-I/soft/cuda/6.0.37/samples/common/inc/  -I/soft/cuda/6.0.37/include
NVCCFLAGS+=-I/soft/mpi/mvapich2/1.9/intel/include/ -Wno-deprecated-gpu-targets
NVCCLIB+=-L/soft/cuda/6.0.37/lib64 -lcudart 

gpu-pi: main.o  pi_cu.o
	mpicc $^ -o gpu-pi $(NVCCLIB) -lm  
	bsub < test.bsub        
	
%.o:%.cpp
	mpicc $(NVCCFLAGS) $(CFLAGS) -o $@ -c $^  -lm

%.o:%.cu 
	nvcc  $(NVCCFLAGS) $(CFLAGS) -o $@ -c $^ 
clean:
	rm -fr *.o *.err *.out gpu-pi
  • The .cu file and the .cpp file are compiled with nvcc and mpicc respectively; note that both the CUDA and MPI include paths have to be supplied at compile time.
  • Link with mpicc, and remember to add the CUDA runtime library (-lcudart).
  • If you skip the job submission and simply run with mpirun, you get no result, because no GPU is actually used:
[scatmstu1@login4 pi2]$ mpirun -np 4 ./gpu-pi
step =   0.0000001
myid = 0: partial pi =   0.0000000
step =   0.0000001
myid = 3: partial pi =   0.0000000
step =   0.0000001
myid = 1: partial pi =   0.0000000
step =   0.0000001
myid = 2: partial pi =   0.0000000
PI =   0.0000000
  • The correct result (cat res.out):
[scatmstu1@login4 pi2]$ cat res.out 
Your job looked like:
------------------------------------------------------------
# LSBATCH: User input
#BSUB -W 0:10
#BSUB -n 4
#BSUB -R "span[ptile=2]"
#BSUB -q "gpu"
#BSUB -o res.out
#BSUB -e out.err
 
module unload compiler
module load compiler/intel/composer_xe_2013_sp1.0.080
module unload mpi
module load mpi/mvapich2/1.9/intel
module unload cuda
module load cuda/6.0.37
module load
mpijob.mvapich2 ./gpu-pi
------------------------------------------------------------
Successfully completed.
Resource usage summary:

    CPU time   :      8.34 sec.
    Max Memory :         2 MB
    Max Swap   :        22 MB

    Max Processes  :         1

The output (if any) follows:

step =   0.0000001
myid = 0: partial pi =   0.9799146
step =   0.0000001
myid = 1: partial pi =   0.8746758
step =   0.0000001
myid = 2: partial pi =   0.7194140
step =   0.0000001
myid = 3: partial pi =   0.5675882
PI =   3.1415925

PS:
Read file <out.err> for stderr output of this job.
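Note that PI = 3.1415925 already differs from π in the seventh digit: the whole computation runs in single precision, so this is roughly the best float can deliver. If more digits are wanted, one direction (a sketch of my own, not from the original post, and only a partial fix as long as the kernel itself keeps accumulating in float) is to do the host-side and MPI reductions in double:

/* In main.cpp: declare pi and pig as double instead of float */
double pi = 0.0, pig;
for (tid = 0; tid < NUM_THREAD*NUM_BLOCK; tid++) pi += (double)sumHost[tid];
pi *= (double)step;
MPI_Allreduce(&pi, &pig, 1, MPI_DOUBLE, MPI_SUM, MPI_COMM_WORLD);
if (myid == 0) printf("PI = %15.13f\n", pig);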

Source: https://www.cnblogs.com/zhcnfyy/p/15184503.html
