From 180753d203e69ea54e96e2cee66e4500beedc12d Mon Sep 17 00:00:00 2001
From: "Yaxun (Sam) Liu"
Date: Thu, 11 Sep 2025 20:39:22 +0000
Subject: [PATCH] added malloc pitch on merged pool embedding

---
 .../merge_pooled_embedding_ops_gpu.cpp        | 78 ++++++++++++++-----
 1 file changed, 57 insertions(+), 21 deletions(-)

diff --git a/fbgemm_gpu/src/merge_pooled_embedding_ops/merge_pooled_embedding_ops_gpu.cpp b/fbgemm_gpu/src/merge_pooled_embedding_ops/merge_pooled_embedding_ops_gpu.cpp
index a2e0d4f4d2..5af5ba57f7 100644
--- a/fbgemm_gpu/src/merge_pooled_embedding_ops/merge_pooled_embedding_ops_gpu.cpp
+++ b/fbgemm_gpu/src/merge_pooled_embedding_ops/merge_pooled_embedding_ops_gpu.cpp
@@ -222,15 +222,21 @@ void all_to_one(
            .output_idx = i,
            .transfer_cuda_event =
                std::make_unique<at::cuda::CUDAEvent>(cudaEventDisableTiming)});
-      AT_CUDA_CHECK(cudaMemcpy2DAsync(
-          dst.data_ptr(),
-          dst.stride(0) * dst.element_size(),
-          src.data_ptr(),
-          src.stride(0) * src.element_size(),
-          src.size(1) * src.element_size(),
-          src.size(0),
-          cudaMemcpyDeviceToDevice,
-          copy_stream));
+      void* tmp1 = nullptr;
+      void* tmp2 = nullptr;
+      size_t srcPitchBytes;
+      size_t dstPitchBytes;
+      AT_CUDA_CHECK(cudaMallocPitch(&tmp1, &srcPitchBytes, src.size(1) * src.element_size(), src.size(0)));
+      AT_CUDA_CHECK(cudaMallocPitch(&tmp2, &dstPitchBytes, dst.size(1) * dst.element_size(), dst.size(0)));
+      AT_CUDA_CHECK(cudaMemcpy2DAsync(
+          dst.data_ptr(),
+          dstPitchBytes, // was: dst.stride(0) * dst.element_size()
+          src.data_ptr(),
+          srcPitchBytes, // was: src.stride(0) * src.element_size()
+          src.size(1) * src.element_size(),
+          src.size(0),
+          cudaMemcpyDeviceToDevice,
+          at::cuda::getCurrentCUDAStream(src.get_device())));
       two_hop_transfers.back().transfer_cuda_event->record(copy_stream);
       is_two_hop_transfer.push_back(true);
     } else {
@@ -279,15 +285,21 @@ void all_to_one(
       auto& dst = output_tensors[i];
 
       // on source device, launch memcpy.
+      void* tmp1 = nullptr;
+      void* tmp2 = nullptr;
+      size_t srcPitchBytes;
+      size_t dstPitchBytes;
+      AT_CUDA_CHECK(cudaMallocPitch(&tmp1, &srcPitchBytes, src.size(1) * src.element_size(), src.size(0)));
+      AT_CUDA_CHECK(cudaMallocPitch(&tmp2, &dstPitchBytes, dst.size(1) * dst.element_size(), dst.size(0)));
       AT_CUDA_CHECK(cudaMemcpy2DAsync(
           dst.data_ptr(),
-          dst.stride(0) * dst.element_size(),
+          dstPitchBytes, // was: dst.stride(0) * dst.element_size()
           src.data_ptr(),
-          src.stride(0) * src.element_size(),
+          srcPitchBytes, // was: src.stride(0) * src.element_size()
           src.size(1) * src.element_size(),
           src.size(0),
           cudaMemcpyDeviceToDevice,
-          copy_stream));
+          at::cuda::getCurrentCUDAStream(src.get_device())));
     }
   }
 
@@ -313,11 +325,17 @@ void all_to_one(
     const auto output_index = two_hop_transfer.output_idx;
     auto& dst = output_tensors.at(output_index);
     // on source device, launch memcpy.
+    void* tmp1 = nullptr;
+    void* tmp2 = nullptr;
+    size_t srcPitchBytes;
+    size_t dstPitchBytes;
+    AT_CUDA_CHECK(cudaMallocPitch(&tmp1, &srcPitchBytes, src.size(1) * src.element_size(), src.size(0)));
+    AT_CUDA_CHECK(cudaMallocPitch(&tmp2, &dstPitchBytes, dst.size(1) * dst.element_size(), dst.size(0)));
     AT_CUDA_CHECK(cudaMemcpy2DAsync(
         dst.data_ptr(),
-        dst.stride(0) * dst.element_size(),
+        dstPitchBytes, // was: dst.stride(0) * dst.element_size()
         src.data_ptr(),
-        src.stride(0) * src.element_size(),
+        srcPitchBytes, // was: src.stride(0) * src.element_size()
         src.size(1) * src.element_size(),
         src.size(0),
         cudaMemcpyDeviceToDevice,
@@ -333,11 +351,17 @@ void all_to_one(
     // single device memcpy, not that src_device == dst_device.
     at::cuda::CUDAStream copy_stream =
         at::cuda::getCurrentCUDAStream(target_device_index);
+    void* tmp1 = nullptr;
+    void* tmp2 = nullptr;
+    size_t srcPitchBytes;
+    size_t dstPitchBytes;
+    AT_CUDA_CHECK(cudaMallocPitch(&tmp1, &srcPitchBytes, src.size(1) * src.element_size(), src.size(0)));
+    AT_CUDA_CHECK(cudaMallocPitch(&tmp2, &dstPitchBytes, dst.size(1) * dst.element_size(), dst.size(0)));
     AT_CUDA_CHECK(cudaMemcpy2DAsync(
         dst.data_ptr(),
-        dst.stride(0) * dst.element_size(),
+        dstPitchBytes, // was: dst.stride(0) * dst.element_size()
         src.data_ptr(),
-        src.stride(0) * src.element_size(),
+        srcPitchBytes, // was: src.stride(0) * src.element_size()
         src.size(1) * src.element_size(),
         src.size(0),
         cudaMemcpyDeviceToDevice,
@@ -456,11 +480,17 @@ Tensor sum_reduce_to_one(
 
       // on source device, launch memcpy.
       auto& dst = copied_tensors[i];
+      void* tmp1 = nullptr;
+      void* tmp2 = nullptr;
+      size_t srcPitchBytes;
+      size_t dstPitchBytes;
+      AT_CUDA_CHECK(cudaMallocPitch(&tmp1, &srcPitchBytes, src.size(1) * src.element_size(), src.size(0)));
+      AT_CUDA_CHECK(cudaMallocPitch(&tmp2, &dstPitchBytes, dst.size(1) * dst.element_size(), dst.size(0)));
       AT_CUDA_CHECK(cudaMemcpy2DAsync(
           dst.data_ptr(),
-          dst.stride(0) * dst.element_size(),
+          dstPitchBytes, // was: dst.stride(0) * dst.element_size()
           src.data_ptr(),
-          src.stride(0) * src.element_size(),
+          srcPitchBytes, // was: src.stride(0) * src.element_size()
           src.size(1) * src.element_size(),
           src.size(0),
           cudaMemcpyDeviceToDevice,
@@ -538,11 +568,17 @@ Tensor sum_reduce_to_one(
       dst_ready.block(copy_stream);
 
       auto& dst = copied_tensors[i];
+      void* tmp1 = nullptr;
+      void* tmp2 = nullptr;
+      size_t srcPitchBytes;
+      size_t dstPitchBytes;
+      AT_CUDA_CHECK(cudaMallocPitch(&tmp1, &srcPitchBytes, src.size(1) * src.element_size(), src.size(0)));
+      AT_CUDA_CHECK(cudaMallocPitch(&tmp2, &dstPitchBytes, dst.size(1) * dst.element_size(), dst.size(0)));
       AT_CUDA_CHECK(cudaMemcpy2DAsync(
           dst.data_ptr(),
-          dst.stride(0) * dst.element_size(),
+          dstPitchBytes, // was: dst.stride(0) * dst.element_size()
           src.data_ptr(),
-          src.stride(0) * src.element_size(),
+          srcPitchBytes, // was: src.stride(0) * src.element_size()
           src.size(1) * src.element_size(),
           src.size(0),
           cudaMemcpyDeviceToDevice,
@@ -741,4 +777,4 @@ TORCH_LIBRARY_FRAGMENT(fbgemm, m) {
       "merge_pooled_embeddings", fbgemm_gpu::merge_pooled_embeddings);
   DISPATCH_TO_CUDA("all_to_one_device", fbgemm_gpu::all_to_one_device);
   DISPATCH_TO_CUDA("sum_reduce_to_one", fbgemm_gpu::sum_reduce_to_one_device);
-}
+}
\ No newline at end of file
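
For reference, the cudaMallocPitch / cudaMemcpy2DAsync pairing used throughout this patch
can be exercised in isolation. The sketch below is illustrative only (the sizes, the host
harness, and the error handling are assumptions, not taken from fbgemm); it demonstrates
the CUDA runtime contract that each pitch argument to cudaMemcpy2DAsync describes the row
stride of the buffer passed in the same position, and that the copy width in bytes must
not exceed either pitch.

#include <cuda_runtime.h>
#include <cstdio>

int main() {
  // Illustrative 2-D extent: 512 rows of 1000 bytes each.
  const size_t widthBytes = 1000;
  const size_t height = 512;

  void* src = nullptr;
  void* dst = nullptr;
  size_t srcPitchBytes = 0;
  size_t dstPitchBytes = 0;

  // Each pitched allocation may pad its rows; the returned pitch is the
  // actual row stride of that specific allocation.
  if (cudaMallocPitch(&src, &srcPitchBytes, widthBytes, height) != cudaSuccess ||
      cudaMallocPitch(&dst, &dstPitchBytes, widthBytes, height) != cudaSuccess) {
    std::fprintf(stderr, "cudaMallocPitch failed\n");
    return 1;
  }

  cudaStream_t stream;
  if (cudaStreamCreate(&stream) != cudaSuccess) {
    return 1;
  }

  // Copy widthBytes x height from src (row stride srcPitchBytes) to dst
  // (row stride dstPitchBytes) on the given stream.
  if (cudaMemcpy2DAsync(dst, dstPitchBytes, src, srcPitchBytes,
                        widthBytes, height,
                        cudaMemcpyDeviceToDevice, stream) != cudaSuccess) {
    std::fprintf(stderr, "cudaMemcpy2DAsync failed\n");
    return 1;
  }

  cudaStreamSynchronize(stream);
  cudaStreamDestroy(stream);
  cudaFree(src);
  cudaFree(dst);
  std::printf("srcPitch=%zu dstPitch=%zu\n", srcPitchBytes, dstPitchBytes);
  return 0;
}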