diff --git a/benchmarks/mp_pingpong_all.cu b/benchmarks/mp_pingpong_all.cu index 89b86ba..6fad0b0 100644 --- a/benchmarks/mp_pingpong_all.cu +++ b/benchmarks/mp_pingpong_all.cu @@ -842,8 +842,8 @@ int main (int argc, char *argv[]) CUDA_CHECK(cudaMemset(rbuf_d, 0, buf_size)); } - MP_CHECK(mp_register(sbuf_d, buf_size, &sreg)); - MP_CHECK(mp_register(rbuf_d, buf_size, &rreg)); + MP_CHECK(mp_register(sbuf_d, buf_size, &sreg, 0)); + MP_CHECK(mp_register(rbuf_d, buf_size, &rreg, 0)); if (!my_rank) fprintf(stdout, "%10d", size); diff --git a/benchmarks/mp_pingpong_kernel.cu b/benchmarks/mp_pingpong_kernel.cu index 6e1efd1..743aedf 100644 --- a/benchmarks/mp_pingpong_kernel.cu +++ b/benchmarks/mp_pingpong_kernel.cu @@ -232,8 +232,8 @@ double sr_exchange (MPI_Comm comm, int size, int iter_count, int validate, doubl CUDA_CHECK(cudaMalloc((void **)&rbuf_d, size*iter_count)); CUDA_CHECK(cudaMemset(rbuf_d, 0, size*iter_count)); - MP_CHECK(mp_register(sbuf_d, size*iter_count, &sreg)); - MP_CHECK(mp_register(rbuf_d, size*iter_count, &rreg)); + MP_CHECK(mp_register(sbuf_d, size*iter_count, &sreg, 0)); + MP_CHECK(mp_register(rbuf_d, size*iter_count, &rreg, 0)); if (validate) { mp_dbg_msg("initializing the buffer \n"); diff --git a/benchmarks/mp_pingpong_kernel_stream.cu b/benchmarks/mp_pingpong_kernel_stream.cu index 3251fbd..0b3b6ae 100644 --- a/benchmarks/mp_pingpong_kernel_stream.cu +++ b/benchmarks/mp_pingpong_kernel_stream.cu @@ -455,8 +455,8 @@ int main (int argc, char *argv[]) CUDA_CHECK(cudaMalloc((void **)&rbuf_d, buf_size)); CUDA_CHECK(cudaMemset(rbuf_d, 0, buf_size)); - MP_CHECK(mp_register(sbuf_d, buf_size, &sreg)); - MP_CHECK(mp_register(rbuf_d, buf_size, &rreg)); + MP_CHECK(mp_register(sbuf_d, buf_size, &sreg, 0)); + MP_CHECK(mp_register(rbuf_d, buf_size, &rreg, 0)); if (!my_rank) { if (prof_init(&prof_normal, 1000, 1000, "1us", 100, 1, tags)) { diff --git a/benchmarks/mp_pingpong_kernel_stream_latency.cu b/benchmarks/mp_pingpong_kernel_stream_latency.cu index 513d4fb..dd57020 100644 --- a/benchmarks/mp_pingpong_kernel_stream_latency.cu +++ b/benchmarks/mp_pingpong_kernel_stream_latency.cu @@ -539,8 +539,8 @@ int main (int argc, char *argv[]) CUDA_CHECK(cudaMalloc((void **)&rbuf_d, buf_size)); CUDA_CHECK(cudaMemset(rbuf_d, 0, buf_size)); - MP_CHECK(mp_register(sbuf_d, buf_size, &sreg)); - MP_CHECK(mp_register(rbuf_d, buf_size, &rreg)); + MP_CHECK(mp_register(sbuf_d, buf_size, &sreg, 0)); + MP_CHECK(mp_register(rbuf_d, buf_size, &rreg, 0)); if (!my_rank) fprintf(stdout, "%10d", size); diff --git a/benchmarks/mp_pingpong_kernel_stream_latency_mpi.cu b/benchmarks/mp_pingpong_kernel_stream_latency_mpi.cu index 7a25633..7b2f2cb 100644 --- a/benchmarks/mp_pingpong_kernel_stream_latency_mpi.cu +++ b/benchmarks/mp_pingpong_kernel_stream_latency_mpi.cu @@ -719,8 +719,8 @@ int main (int argc, char *argv[]) CUDA_CHECK(cudaMemset(rbuf_d, 0, buf_size)); } - MP_CHECK(mp_register(sbuf_d, buf_size, &sreg)); - MP_CHECK(mp_register(rbuf_d, buf_size, &rreg)); + MP_CHECK(mp_register(sbuf_d, buf_size, &sreg, 0)); + MP_CHECK(mp_register(rbuf_d, buf_size, &rreg, 0)); if (!my_rank) fprintf(stdout, "%10d", size); diff --git a/benchmarks/mp_pingpong_kernel_stream_mpi.cu b/benchmarks/mp_pingpong_kernel_stream_mpi.cu index f744b2c..93d32cd 100644 --- a/benchmarks/mp_pingpong_kernel_stream_mpi.cu +++ b/benchmarks/mp_pingpong_kernel_stream_mpi.cu @@ -641,8 +641,8 @@ int main (int argc, char *argv[]) CUDA_CHECK(cudaMemset(rbuf_d, 0, buf_size)); } - MP_CHECK(mp_register(sbuf_d, buf_size, &sreg)); - MP_CHECK(mp_register(rbuf_d, buf_size, &rreg)); + MP_CHECK(mp_register(sbuf_d, buf_size, &sreg, 0)); + MP_CHECK(mp_register(rbuf_d, buf_size, &rreg, 0)); if (my_rank == 0) { if (prof_init(&prof_normal, 1000, 1000, "1us", 100, 1, tags)) { diff --git a/benchmarks/mp_pingpong_kernel_stream_wait_send.cu b/benchmarks/mp_pingpong_kernel_stream_wait_send.cu index 3aa92d0..f32a960 100644 --- a/benchmarks/mp_pingpong_kernel_stream_wait_send.cu +++ b/benchmarks/mp_pingpong_kernel_stream_wait_send.cu @@ -388,8 +388,8 @@ int main (int c, char *v[]) CUDA_CHECK(cudaMalloc((void **)&rbuf_d, buf_size)); CUDA_CHECK(cudaMemset(rbuf_d, 0, buf_size)); - MP_CHECK(mp_register(sbuf_d, buf_size, &sreg)); - MP_CHECK(mp_register(rbuf_d, buf_size, &rreg)); + MP_CHECK(mp_register(sbuf_d, buf_size, &sreg, 0)); + MP_CHECK(mp_register(rbuf_d, buf_size, &rreg, 0)); if (!my_rank) { if (prof_init(&prof_normal, 1000, 1000, "1us", 100, 1, tags)) { diff --git a/benchmarks/mp_producer_consumer_kernel_stream.cu b/benchmarks/mp_producer_consumer_kernel_stream.cu index 0c1e8ac..94754bd 100644 --- a/benchmarks/mp_producer_consumer_kernel_stream.cu +++ b/benchmarks/mp_producer_consumer_kernel_stream.cu @@ -373,8 +373,8 @@ int main (int c, char *v[]) CUDA_CHECK(cudaMalloc((void **)&rbuf_d, buf_size)); CUDA_CHECK(cudaMemset(rbuf_d, 0, buf_size)); - MP_CHECK(mp_register(sbuf_d, buf_size, &sreg)); - MP_CHECK(mp_register(rbuf_d, buf_size, &rreg)); + MP_CHECK(mp_register(sbuf_d, buf_size, &sreg, 0)); + MP_CHECK(mp_register(rbuf_d, buf_size, &rreg, 0)); if (!my_rank) { if (prof_init(&prof_normal, 1000, 1000, "1us", 100, 1, tags)) { diff --git a/benchmarks/mp_sendrecv_kernel_stream.cu b/benchmarks/mp_sendrecv_kernel_stream.cu index 4f04ff2..c86864e 100644 --- a/benchmarks/mp_sendrecv_kernel_stream.cu +++ b/benchmarks/mp_sendrecv_kernel_stream.cu @@ -359,8 +359,8 @@ int main (int c, char *v[]) CUDA_CHECK(cudaMalloc((void **)&rbuf_d, buf_size)); CUDA_CHECK(cudaMemset(rbuf_d, 0, buf_size)); - MP_CHECK(mp_register(sbuf_d, buf_size, &sreg)); - MP_CHECK(mp_register(rbuf_d, buf_size, &rreg)); + MP_CHECK(mp_register(sbuf_d, buf_size, &sreg, 0)); + MP_CHECK(mp_register(rbuf_d, buf_size, &rreg, 0)); if (!my_rank) { if (prof_init(&prof_normal, 1000, 1000, "1us", 100, 1, tags)) { diff --git a/comm_library/comm.cpp b/comm_library/comm.cpp index 2c53573..34390de 100644 --- a/comm_library/comm.cpp +++ b/comm_library/comm.cpp @@ -246,11 +246,11 @@ int comm_init(MPI_Comm comm, int gpuId) iomb(); DBG("registering ready_table size=%zd\n", table_size); - MP_CHECK(mp_register(ready_table, table_size, &ready_table_reg)); + MP_CHECK(mp_register(ready_table, table_size, &ready_table_reg, 0)); DBG("creating ready_table window\n"); MP_CHECK(mp_window_create(ready_table, table_size, &ready_table_win)); DBG("registering remote_ready_table\n"); - MP_CHECK(mp_register(remote_ready_values, table_size, &remote_ready_values_reg)); + MP_CHECK(mp_register(remote_ready_values, table_size, &remote_ready_values_reg, 0)); comm_initialized = 1; @@ -706,7 +706,24 @@ int comm_register(void *buf, size_t size, comm_reg_t *creg) if (!*reg) { DBG("registering buffer %p\n", buf); - MP_CHECK(mp_register(buf, size, reg)); + MP_CHECK(mp_register(buf, size, reg, 0)); + } + +out: + return ret; +} + +int comm_register_odp(comm_reg_t *creg) +{ + assert(comm_initialized); + int ret = 0; + int retcode; + mp_reg_t *reg = (mp_reg_t*)creg; + assert(reg); + + if (!*reg) { + DBG("registering implicit ODP MR\n"); + MP_CHECK(mp_register(NULL, 0, reg, IBV_EXP_ACCESS_ON_DEMAND)); } out: diff --git a/comm_library/comm.h b/comm_library/comm.h index eb76773..947b608 100644 --- a/comm_library/comm.h +++ b/comm_library/comm.h @@ -125,6 +125,7 @@ extern "C" { int comm_prepare_wait_all(int count, comm_request_t *creqs); comm_dev_descs_t comm_prepared_requests(); int comm_register(void *buf, size_t size, comm_reg_t *creg); + int comm_register_odp(comm_reg_t *creg); int comm_deregister(comm_reg_t *creg); int comm_select_device(int mpiRank); diff --git a/comm_library/examples/comm_pingpong.cpp b/comm_library/examples/comm_pingpong.cpp index 745c459..3ac82c8 100644 --- a/comm_library/examples/comm_pingpong.cpp +++ b/comm_library/examples/comm_pingpong.cpp @@ -54,14 +54,24 @@ do { \ } while (0) -comm_reg_t * sreg, * rreg; +comm_reg_t * sreg, * rreg, * odpreg; int comm_size, my_rank, device_id; unsigned char * send_buf[MAX_PEERS]; unsigned char * recv_buf[MAX_PEERS]; -int use_gpu_buffers=0; int tot_iters=MAX_ITERS; -int max_size=BUF_SIZE; +int buf_size=BUF_SIZE; +int use_gpu_buffers=0; int validate=0; +int use_odp=0; + +static void usage() +{ + printf("Options:\n"); + printf(" -g allocate GPU intead of CPU memory buffers\n"); + printf(" -o use implici ODP\n"); + printf(" -n= number of exchanges (default %d)\n", MAX_ITERS); + printf(" -s= S/R buffer size (default %d)\n", BUF_SIZE); +} int async_exchange(int iter) { int peer, n_sreqs=0, n_rreqs=0; @@ -73,8 +83,10 @@ int async_exchange(int iter) { { if(peer != my_rank) { - comm_irecv(recv_buf[peer], max_size, MPI_CHAR, &rreg[peer], peer, &recv_requests[n_rreqs]); - comm_send_ready_on_stream(peer, &ready_requests[n_rreqs], NULL); + COMM_CHECK(comm_irecv(recv_buf[peer], buf_size, MPI_CHAR, + (use_odp ? &odpreg[0] : &rreg[peer]), + peer, &recv_requests[n_rreqs])); + COMM_CHECK(comm_send_ready_on_stream(peer, &ready_requests[n_rreqs], NULL)); n_rreqs++; } } @@ -83,18 +95,20 @@ int async_exchange(int iter) { { if(peer != my_rank) { - comm_wait_ready_on_stream(peer,NULL); - comm_isend_on_stream(send_buf[peer], max_size, MPI_CHAR, - &sreg[peer], peer, &send_requests[n_sreqs], NULL); + COMM_CHECK(comm_wait_ready_on_stream(peer,NULL)); + COMM_CHECK(comm_isend_on_stream(send_buf[peer], buf_size, MPI_CHAR, + (use_odp ? &odpreg[0] : &sreg[peer]), + peer, &send_requests[n_sreqs], NULL)); n_sreqs++; } } - comm_wait_all_on_stream(n_rreqs, recv_requests, NULL); - comm_wait_all_on_stream(n_sreqs, send_requests, NULL); - //comm_wait_all_on_stream(n_rreqs, ready_requests, NULL); + COMM_CHECK(comm_wait_all_on_stream(n_rreqs, recv_requests, NULL)); + COMM_CHECK(comm_wait_all_on_stream(n_rreqs, ready_requests, NULL)); + COMM_CHECK(comm_wait_all_on_stream(n_sreqs, send_requests, NULL)); + //printf("Before progress, %d iter, %d n_rreqs, %d n_sreqs, %d comm_size\n", iter, n_rreqs, n_sreqs, comm_size); comm_progress(); } @@ -108,8 +122,10 @@ int sync_exchange(int iter) { { if(peer != my_rank) { - comm_irecv(recv_buf[peer], max_size, MPI_CHAR, &rreg[peer], peer, &recv_requests[n_rreqs]); - comm_send_ready(peer, &ready_requests[n_rreqs]); + COMM_CHECK(comm_irecv(recv_buf[peer], buf_size, MPI_CHAR, + (use_odp ? &odpreg[0] : &rreg[peer]), + peer, &recv_requests[n_rreqs])); + COMM_CHECK(comm_send_ready(peer, &ready_requests[n_rreqs])); n_rreqs++; } } @@ -118,13 +134,15 @@ int sync_exchange(int iter) { { if(peer != my_rank) { - comm_wait_ready(peer); - comm_isend(send_buf[peer], max_size, MPI_CHAR, - &sreg[peer], peer, &send_requests[n_sreqs]); + COMM_CHECK(comm_wait_ready(peer)); + COMM_CHECK(comm_isend(send_buf[peer], buf_size, MPI_CHAR, + (use_odp ? &odpreg[0] : &sreg[peer]), + peer, &send_requests[n_sreqs])); n_sreqs++; } } + comm_flush(); } @@ -132,82 +150,120 @@ int main(int argc, char **argv) { int i,j,k,iter; char *value; double tot_time, start_time, stop_time; - - value = getenv("USE_GPU_BUFFERS"); - if (value != NULL) { - use_gpu_buffers = atoi(value); - } - - value = getenv("MAX_SIZE"); - if (value != NULL) { - max_size = atoi(value); - } - - value = getenv("TOT_ITERS"); - if (value != NULL) { - tot_iters = atoi(value); - if(tot_iters > MAX_ITERS) - { - printf("ERROR: max iters number allowed=%d\n", MAX_ITERS); - tot_iters = MAX_ITERS; + int c; + + while (1) { + + c = getopt(argc, argv, "gon:s:"); + if (c == -1) + break; + + switch (c) { + case 'g': + use_gpu_buffers=1; + printf("Using GPU memory for communication buffers\n"); + break; + + case 'n': + tot_iters = strtol(optarg, NULL, 0); + if(tot_iters > MAX_ITERS) + tot_iters = MAX_ITERS; + printf("Tot iters=%d\n", tot_iters); + break; + + case 'o': + use_odp=1; + printf("Using implicit ODP\n"); + break; + + + case 's': + buf_size=strtol(optarg, NULL, 0); + printf("Using buf_size=%d\n", buf_size); + + break; + + default: + usage(); + return 1; } - } - + value = getenv("ENABLE_VALIDATION"); if (value != NULL) { validate = atoi(value); } - if(!comm_use_comm()) - fprintf(stderr, "ERROR: pingpong + one sided for comm library only\n"); - MPI_Init(&argc, &argv); MPI_Comm_size(MPI_COMM_WORLD, &comm_size); MPI_Comm_rank(MPI_COMM_WORLD, &my_rank); - assert(comm_size <= MAX_PEERS); + if (comm_size < 2 || comm_size > MAX_PEERS) { + fprintf(stderr, "this test requires 2 async sa=%d, use_gpu_buffers=%d, max_size=%d, tot_iters=%d num peers=%d validate=%d\n", - comm_use_model_sa()?1:0, use_gpu_buffers, max_size, tot_iters, comm_size, validate); + printf("# SA Model=%d\n# use_gpu_buffers=%d\n# buf_size=%d\n# tot_iters=%d\n# num peers=%d\n# validate=%d\n# use_odp=%d\n", + comm_use_model_sa()?1:0, use_gpu_buffers, buf_size, tot_iters, comm_size, validate, use_odp); start_time = MPI_Wtime(); for(iter=0; itermr = ibv_reg_mr(ib_ctx->pd, addr, length, flags); - if (!reg->mr) { - mp_err_msg("ibv_reg_mr returned NULL for addr:%p size:%zu errno=%d(%s)\n", - addr, length, errno, strerror(errno)); -#ifdef DADO_DEBUG - check_cuda_ptr(addr, length); - spin_forever(); - free(reg); - MPI_Abort(MPI_COMM_WORLD, 1); -#endif - return MP_FAILURE; - } - - reg->key = reg->mr->lkey; + } - mp_dbg_msg("reg=%p key=%x\n", reg, reg->key); + in.pd = ib_ctx->pd; + in.exp_access = flags; + in.comp_mask = 0; - *reg_ = reg; - - return MP_SUCCESS; + if(addr != NULL && length > 0) + { + CUdeviceptr base; + CUresult curesult; + + // note: register addr, not base. no advantage in registering the whole buffer as we don't + // maintain a registration cache yet + curesult = cuPointerGetAttribute((void *)&type, CU_POINTER_ATTRIBUTE_MEMORY_TYPE, (CUdeviceptr)addr); + if ((curesult == CUDA_SUCCESS) && (type == CU_MEMORYTYPE_DEVICE)) { + CU_CHECK(cuMemGetAddressRange(&base, &size, (CUdeviceptr)addr)); + + flag = 1; + CU_CHECK(cuPointerSetAttribute(&flag, CU_POINTER_ATTRIBUTE_SYNC_MEMOPS, base)); + } + + in.addr = addr; + in.length = length; + } + + mp_dbg_msg("exp_flags=%llx\n", exp_flags); + + if(exp_flags & IBV_EXP_ACCESS_ON_DEMAND) + { + dattr.comp_mask = IBV_EXP_DEVICE_ATTR_ODP | IBV_EXP_DEVICE_ATTR_EXP_CAP_FLAGS; + int ret = ibv_exp_query_device(ib_ctx->context, &dattr); + if (!(dattr.exp_device_cap_flags & IBV_EXP_DEVICE_ODP)) + { + mp_err_msg("ODP not supported!\n"); + return MP_FAILURE; + } + + //In LibMP we only support implicit ODP + if(!(dattr.odp_caps.general_odp_caps & IBV_EXP_ODP_SUPPORT_IMPLICIT)) + { + mp_err_msg("Implicit ODP not supported!\n"); + return MP_FAILURE; + } + +#if defined(__x86_64__) || defined (__i386__) + mp_warn_msg("NOTE: This implicit ODP MR can't be used with GMEM buffers on x86 systems\n"); +#endif + //Implicit On-Demand Paging is supported. + in.addr = 0; + in.length = IBV_EXP_IMPLICIT_MR_SIZE; + in.exp_access |= IBV_EXP_ACCESS_ON_DEMAND; + } + + mp_dbg_msg("ibv_exp_reg_mr addr:0 size:IBV_EXP_IMPLICIT_MR_SIZE flags=0x%08x\n", in.exp_access); + reg->mr = ibv_exp_reg_mr(&in); +// reg->mr = ibv_reg_mr(ib_ctx->pd, addr, length, flags); + + if (!reg->mr) { + mp_err_msg("ibv_reg_mr returned NULL for addr:%p size:%zu errno=%d(%s)\n", + addr, length, errno, strerror(errno)); + + #ifdef DADO_DEBUG + check_cuda_ptr(addr, length); + spin_forever(); + free(reg); + MPI_Abort(MPI_COMM_WORLD, 1); + #endif + + return MP_FAILURE; + } + + reg->key = reg->mr->lkey; + + mp_dbg_msg("reg=%p key=%x\n", reg, reg->key); + + *reg_ = reg; + + return MP_SUCCESS; } int mp_deregister(mp_reg_t *reg_) { - int ret=0; - struct mp_reg *reg = (struct mp_reg *) *reg_; + int ret=0; + struct mp_reg *reg = (struct mp_reg *) *reg_; - assert(reg); - assert(reg->mr); - ret = ibv_dereg_mr(reg->mr); - if(ret) - { + assert(reg); + assert(reg->mr); + ret = ibv_dereg_mr(reg->mr); + if(ret) + { mp_err_msg("ibv_dereg_mr returned %d\n", ret); return MP_FAILURE; - } + } - free(reg); - return MP_SUCCESS; + free(reg); + return MP_SUCCESS; } char shm_filename[100]; @@ -1206,7 +1249,7 @@ int mp_init (MPI_Comm comm, int *peers, int count, int init_flags, int gpu_id) } if (mp_enable_ud) { - int result = mp_register(ud_padding, UD_ADDITION, &ud_padding_reg); + int result = mp_register(ud_padding, UD_ADDITION, &ud_padding_reg, 0); assert(result == MP_SUCCESS); } @@ -1820,7 +1863,7 @@ int mp_window_create(void *addr, size_t size, mp_window_t *window_t) assert(exchange_win != NULL); window->reg=NULL; - result = mp_register(addr, size, &window->reg); + result = mp_register(addr, size, &window->reg, 0); assert(result == MP_SUCCESS); exchange_win[mpi_comm_rank].base_addr = addr;