Differentiate a "body" function with a template loop pattern on GPU #1654

bemichel · 2024-02-01T16:14:02Z

bemichel
Feb 1, 2024

Hi,

Sorry to re-open the subject (cf. #1565), but I am now trying to upgrade the CPU version above to a GPU one for the backward mode.

CPU implementation

Here you will find the updated CPU example from above, for forward and backward mode, using a functor instead of a lambda function: https://fwd.gymni.ch/zEqXXH

The approach written in the CPU example is really generic; I can now define any functor and it will be differentiated by calling:

__enzyme_fwddiff<void>((void*)loop_on_dof_cpu<decltype(f)>, nb_cell, enzyme_dup, (void*)&f, (void*)&df);

for the forward mode or

__enzyme_autodiff<void>((void*)loop_on_dof_cpu<decltype(f)>, nb_cell, enzyme_dup, (void*)&f, (void*)&df);

for the backward mode.

GPU implementation

I really do not know yet whether it will be possible to use this defined loop+functor approach on GPU.

According to the documentation on GPU, a naive transcription of the CPU version into CUDA will not work, because of the call to loop_on_dof_gpu<<<grd_topo, blk_topo>>>(nb_cell, f);: https://fwd.gymni.ch/5CWooa

To do it right, I probably need to use Enzyme’s custom derivative registration to define a custom forward and reverse pass for the wrapper function of compute_gpu as follows : https://fwd.gymni.ch/murZk4

#include <cstdio>
#include <cassert>
#include <functional>
#include <vector>
#include <iostream>

#define __INLINE__ inline __attribute__((always_inline))

#define DECLARE_FUNCTION \
__host__ __device__      \
__INLINE__

// --------------------------------------------------------------------
/// Point-wise kernel body: writes 3*a[tx]^2 + 5 into b[tx].
///
/// @param tx index of the single entry to process
/// @param a  input array (read at tx only)
/// @param b  output array (written at tx only)
DECLARE_FUNCTION
void knl_simple_body0(
    const int                  tx,
    const double* __restrict__ a,
          double* __restrict__ b)
{
    // Load the input once; the expression keeps the original evaluation order.
    const double a_tx = a[tx];
    b[tx] = 3.*a_tx*a_tx + 5.;
}

/// Functor that binds an input/output array pair and applies
/// knl_simple_body0 to one index at a time.
///
/// The object only stores the two raw pointers it is given; it does not
/// own or free them.
struct SimpleFunctor {
    /// Stores the pointers as-is (no copy of the pointed-to data).
    SimpleFunctor(const double* a, double* b)
        : _a{a}
        , _b{b}
    {}

    /// Processes entry `idx` of the bound arrays.
    DECLARE_FUNCTION
    void compute(int idx) {
        knl_simple_body0(idx, _a, _b);
    }

protected:
    const double* _a;   ///< input array handed to knl_simple_body0
          double* _b;   ///< output array handed to knl_simple_body0
};

// --------------------------------------------------------------------
/// Per-thread body of the generic loop: maps the calling thread to a flat
/// 1D index and, if that index is a valid cell, forwards it to f.compute().
///
/// @param nb_cell number of cells; threads whose index is >= nb_cell do nothing
/// @param f       functor exposing `compute(int)`
template<typename F>
DECLARE_FUNCTION
void loop_on_dof_gpu_body(const int nb_cell, F& f) {
    const int cell = blockIdx.x * blockDim.x + threadIdx.x;

    // Threads past the end of the data have nothing to do.
    if (cell >= nb_cell) {
        return;
    }
    f.compute(cell);
}

/// Kernel entry point for the generic loop; every launched thread runs
/// loop_on_dof_gpu_body on the kernel's own copy of the functor.
///
/// WARNING: `f` must be taken by value here, not as F& or F&&.
template<typename F>
__global__
void loop_on_dof_gpu(const int nb_cell, F f) {
    loop_on_dof_gpu_body<F>(nb_cell, f);
}

/// Host-side launcher: runs loop_on_dof_gpu over `nb_cell` cells with a
/// 1D grid of 8-thread blocks.
///
/// The block count is nb_cell/8 + 1, so at least one block is always
/// launched; surplus threads are filtered by the bounds check in
/// loop_on_dof_gpu_body.
///
/// @param nb_cell number of cells to process
/// @param f       functor, copied by value into the kernel
template<typename F>
void compute_gpu(const int nb_cell, F& f) {
    const int threads_per_block = 8;
    const int block_count       = nb_cell/threads_per_block + 1;

    const dim3 block_shape(threads_per_block, 1, 1);
    const dim3 grid_shape (block_count      , 1, 1);

    loop_on_dof_gpu<<<grid_shape, block_shape>>>(nb_cell, f);
}

// --------------------------------------------------------------------
#if defined(ENABLE_ENZYME)
// Activity markers handed to the Enzyme entry points ahead of an argument
// (main passes `enzyme_dup` before the functor / shadow-functor pair).
// NOTE(review): presumably Enzyme recognises these globals by name and their
// values are never read -- confirm against the Enzyme documentation.
int enzyme_dup;
int enzyme_dupnoneed;
int enzyme_out;
int enzyme_const;

// Host-callable reverse-mode entry point. Declared only: no definition is
// visible in this file (NOTE(review): presumably supplied by the Enzyme
// plugin at compile time -- confirm).
template < typename return_type, typename ... T >
return_type __enzyme_autodiff(void*, T ... );

// Device-side split-mode entry point: the returned pointer is stored as the
// per-thread "tape" by aug_loop_on_dof_gpu.
template <typename... Args>
__device__ void* __enzyme_augmentfwd(Args...);

// Device-side split-mode entry point: called by rev_loop_on_dof_gpu with the
// tape pointer that __enzyme_augmentfwd produced.
template <typename... Args>
__device__ void __enzyme_reverse(Args...);

/// Augmented-forward kernel: each launched thread runs the augmented forward
/// pass of loop_on_dof_gpu_body<F> and stores the returned tape pointer.
///
/// The tape slot is selected with the *global* thread index, so `tape` must
/// hold gridDim.x * blockDim.x entries. Indexing with threadIdx.x alone would
/// make threads of different blocks overwrite each other's tape pointer.
///
/// @param nb_cell number of cells (forwarded to the body)
/// @param f       primal functor (kernel-local copy)
/// @param df      shadow functor (kernel-local copy)
/// @param tape    device array with one slot per launched thread
template<typename F>
__global__ void aug_loop_on_dof_gpu(const int nb_cell, F f, F df, void** tape) {
    const size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
    tape[idx] = __enzyme_augmentfwd((void*)loop_on_dof_gpu_body<F>, nb_cell, (void*)&f, (void*)&df);
}

/// Custom augmented-forward pass registered for compute_gpu<F>.
///
/// Allocates one tape slot per launched thread, launches
/// aug_loop_on_dof_gpu with the same grid/block layout as compute_gpu, and
/// returns the device tape array for rev_compute_gpu to consume and free.
///
/// @return device pointer to the tape array, or nullptr if the allocation or
///         the kernel launch failed (an error is printed to stderr).
template<typename F>
void* aug_compute_gpu(const int nb_cell, F& f, F& df) {
    // Must stay identical to the layout used by rev_compute_gpu: both kernels
    // derive the tape slot from the global thread index.
    const int n_thread_per_block = 8;
    const int n_blks = nb_cell/n_thread_per_block+1;
    const dim3 blk_topo = dim3(n_thread_per_block,1,1);
    const dim3 grd_topo = dim3(n_blks            ,1,1);

    // One slot for every launched thread, not just for one block.
    const size_t n_launched = (size_t)n_blks * (size_t)n_thread_per_block;

    void** tape = nullptr;
    cudaError_t err = cudaMalloc(&tape, sizeof(void*) * n_launched);
    if (err != cudaSuccess) {
        fprintf(stderr, "aug_compute_gpu: tape allocation failed: %s\n", cudaGetErrorString(err));
        return nullptr;
    }

    aug_loop_on_dof_gpu<<<grd_topo, blk_topo>>>(nb_cell, f, df, tape);
    err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "aug_compute_gpu: kernel launch failed: %s\n", cudaGetErrorString(err));
        // The tape was never filled; do not hand garbage to the reverse pass.
        cudaFree(tape);
        return nullptr;
    }
    return (void*)tape;
}

/// Reverse kernel: each launched thread runs the reverse pass of
/// loop_on_dof_gpu_body<F> with the tape pointer its counterpart stored in
/// aug_loop_on_dof_gpu (same global-thread-index slot).
///
/// @param nb_cell number of cells (forwarded to the body)
/// @param f       primal functor (kernel-local copy)
/// @param df      shadow functor (kernel-local copy)
/// @param tape    device array filled by aug_loop_on_dof_gpu
template<typename F>
__global__ void rev_loop_on_dof_gpu(const int nb_cell, F f, F df, void** tape) {
    const size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
    __enzyme_reverse((void*)loop_on_dof_gpu_body<F>, nb_cell, (void*)&f, (void*)&df, tape[idx]);
}

/// Custom reverse pass registered for compute_gpu<F>.
///
/// Launches rev_loop_on_dof_gpu with the same layout as aug_compute_gpu and
/// then releases the tape array.
///
/// @param tape device tape array returned by aug_compute_gpu; if it is
///             nullptr the reverse pass is skipped and an error is printed.
template<typename F>
void rev_compute_gpu(const int nb_cell, F& f, F& df, void* tape) {
    if (tape == nullptr) {
        fprintf(stderr, "rev_compute_gpu: no tape available, reverse pass skipped\n");
        return;
    }

    // Must stay identical to the layout used by aug_compute_gpu.
    const int n_thread_per_block = 8;
    const int n_blks = nb_cell/n_thread_per_block+1;
    const dim3 blk_topo = dim3(n_thread_per_block,1,1);
    const dim3 grd_topo = dim3(n_blks            ,1,1);

    rev_loop_on_dof_gpu<<<grd_topo, blk_topo>>>(nb_cell, f, df, (void**)tape);
    const cudaError_t err = cudaGetLastError();
    if (err != cudaSuccess) {
        fprintf(stderr, "rev_compute_gpu: kernel launch failed: %s\n", cudaGetErrorString(err));
    }
    // NOTE(review): relies on cudaFree synchronizing with the kernel above
    // before releasing the tape -- confirm against the CUDA Runtime API docs.
    cudaFree(tape);
}

// Custom-derivative registration for compute_gpu<SimpleFunctor>: the table
// lists, in order, the primal function, its augmented-forward replacement and
// its reverse replacement.
// NOTE(review): presumably Enzyme picks this table up through the
// `__enzyme_register_gradient` name prefix -- confirm against the Enzyme docs.
void* __enzyme_register_gradient_compute_gpu_simple[3] = { (void*)compute_gpu<SimpleFunctor>,
                                                           (void*)aug_compute_gpu<SimpleFunctor>,
                                                           (void*)rev_compute_gpu<SimpleFunctor> };
#endif /// ENABLE_ENZYME

// --------------------------------------------------------------------
/// Driver: runs the primal kernel on the GPU and, when ENABLE_ENZYME is
/// defined, its reverse-mode derivative, checking both against the closed
/// form of b(a) = 3*a*a + 5 at a = 12.
///
/// @return 0 on success, 1 if a CUDA call fails (device buffers are then left
///         for process teardown to release).
int main(int argc, char *argv[])
{
    printf("argc == %d\n", argc);
    const int nb_cell = 16;
    const size_t n_bytes = sizeof(double) * nb_cell;

    // Reports a failed CUDA call on stderr; returns true when `err` is success.
    auto cuda_ok = [](cudaError_t err, const char* what) -> bool {
        if (err == cudaSuccess) {
            return true;
        }
        fprintf(stderr, "[CUDA error] %s: %s\n", what, cudaGetErrorString(err));
        return false;
    };

    // Host buffers: a = 12 everywhere, b = 0 everywhere.
    std::vector<double> a(nb_cell, 12.);
    std::vector<double> b(nb_cell, 0.);

    // GPU version (Primal)
    // --------------------
    double *a_d = nullptr, *b_d = nullptr;
    if (!cuda_ok(cudaMalloc(&a_d, n_bytes), "cudaMalloc(a_d)")) return 1;
    if (!cuda_ok(cudaMalloc(&b_d, n_bytes), "cudaMalloc(b_d)")) return 1;

    if (!cuda_ok(cudaMemcpy(a_d, a.data(), n_bytes, cudaMemcpyHostToDevice), "cudaMemcpy(a -> a_d)")) return 1;
    if (!cuda_ok(cudaMemcpy(b_d, b.data(), n_bytes, cudaMemcpyHostToDevice), "cudaMemcpy(b -> b_d)")) return 1;

    auto f_gpu = SimpleFunctor(a_d, b_d);
    compute_gpu(nb_cell, f_gpu);
    // Kernel launches do not return a status; query it explicitly.
    if (!cuda_ok(cudaGetLastError(), "compute_gpu launch")) return 1;

    if (!cuda_ok(cudaMemcpy(a.data(), a_d, n_bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(a_d -> a)")) return 1;
    if (!cuda_ok(cudaMemcpy(b.data(), b_d, n_bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(b_d -> b)")) return 1;

    printf("[GPU, direct] a[0]         == %f\n", a[0]);
    printf("[GPU, direct] a[nb_cell-1] == %f\n", a[nb_cell-1]);
    printf("[GPU, direct] b[0]         == %f\n", b[0]);
    printf("[GPU, direct] b[nb_cell-1] == %f\n", b[nb_cell-1]);
    // 3*12*12 + 5 = 437 is exactly representable, so exact comparison is fine.
    assert(a[0] == 12.);
    assert(a[nb_cell-1] == 12.);
    assert(b[0] == 437.);
    assert(b[nb_cell-1] == 437.);

#if defined(ENABLE_ENZYME)
    // GPU version (backward)
    // ----------------------
    // b(a) = 3.*a*a + 5.
    // Reverse mode: da += (3.*2.*a)*db; the asserts below expect db to come
    // back as 0.
    std::vector<double> da(nb_cell, 0.);
    std::vector<double> db(nb_cell, 1.);

    double *da_d = nullptr, *db_d = nullptr;
    if (!cuda_ok(cudaMalloc(&da_d, n_bytes), "cudaMalloc(da_d)")) return 1;
    if (!cuda_ok(cudaMalloc(&db_d, n_bytes), "cudaMalloc(db_d)")) return 1;

    if (!cuda_ok(cudaMemcpy(da_d, da.data(), n_bytes, cudaMemcpyHostToDevice), "cudaMemcpy(da -> da_d)")) return 1;
    if (!cuda_ok(cudaMemcpy(db_d, db.data(), n_bytes, cudaMemcpyHostToDevice), "cudaMemcpy(db -> db_d)")) return 1;

    // The shadow functor mirrors f_gpu but points at the derivative buffers.
    auto df_gpu = SimpleFunctor(da_d, db_d);
    __enzyme_autodiff<void>((void*)compute_gpu<SimpleFunctor>, nb_cell, enzyme_dup, (void*)&f_gpu, (void*)&df_gpu);
    if (!cuda_ok(cudaGetLastError(), "__enzyme_autodiff(compute_gpu)")) return 1;

    if (!cuda_ok(cudaMemcpy(da.data(), da_d, n_bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(da_d -> da)")) return 1;
    if (!cuda_ok(cudaMemcpy(db.data(), db_d, n_bytes, cudaMemcpyDeviceToHost), "cudaMemcpy(db_d -> db)")) return 1;

    printf("[GPU, bwd] da[0]         == %f\n", da[0]);
    printf("[GPU, bwd] da[nb_cell-1] == %f\n", da[nb_cell-1]);
    printf("[GPU, bwd] db[0]         == %f\n", db[0]);
    printf("[GPU, bwd] db[nb_cell-1] == %f\n", db[nb_cell-1]);

    assert(da[0] == (3.*2.*12.*1.));
    assert(da[nb_cell-1] == (3.*2.*12.*1.));
    assert(db[0] == 0.);
    assert(db[nb_cell-1] == 0.);

    cudaFree(da_d);
    cudaFree(db_d);
#endif /// ENABLE_ENZYME

    cudaFree(a_d);
    cudaFree(b_d);

    // 0 signals success to the shell; a non-zero status is reserved for the
    // CUDA failure paths above.
    return 0;
}

NB: In fact, I'm having trouble creating the appropriate environment (llvm-14+cuda-11.2+clang-14+enzyme-0.81), so I have not had any opportunity to test the code above.

Am I on the right track to being able to calculate the gradient with Enzyme on GPU ?

If I am completely wrong, can you tell me whether this approach with a defined loop+functor has any chance of working on GPU?

If it is good :

the only difference from the CPU version will be the registration of void* __enzyme_register_gradient_compute_gpu_...[3] = {...}, which would be perfect!
if I want to compute the forward mode only, can I call __enzyme_fwddiff<void>((void*)compute_gpu<SimpleFunctor>, nb_cell, enzyme_dup, (void*)&f_gpu, (void*)&df_gpu); as done on CPU?

Thanks a lot for your precious help.
Best regards

bemichel · 2024-02-02T21:04:39Z

bemichel
Feb 2, 2024
Author

Hi,
I was finally able to test the proposed implementation with a defined loop+functor, and it works perfectly fine (clang-14+Enzyme-0.081-CUDA-11.2) in backward mode: https://fwd.gymni.ch/eQU8MJ

$> ./a.out                                                                                                                     
[GPU, direct] a[0]         == 12.000000                                                                                     
[GPU, direct] a[nb_cell-1] == 12.000000                                                                                     
[GPU, direct] b[0]         == 437.000000                                                                                    
[GPU, direct] b[nb_cell-1] == 437.000000
[GPU, backward] da[0]         == 72.000000
[GPU, backward] da[nb_cell-1] == 72.000000
[GPU, backward] db[0]         == 0.000000
[GPU, backward] db[nb_cell-1] == 0.000000

I just adapted the first version with a shadow parameter for nb_cell, as suggested in #267.
I now have trouble obtaining the forward mode only with this approach: #1655
You can probably close this discussion.

0 replies

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Differentiate a "body" function with a template loop pattern on GPU #1654

Uh oh!

{{title}}

Uh oh!

Replies: 1 comment

Uh oh!

{{title}}

Uh oh!

Select a reply

Uh oh!

Differentiate a "body" function with a template loop pattern on GPU #1654

Uh oh!

bemichel Feb 1, 2024

CPU implementation

GPU implementation

Replies: 1 comment

Uh oh!

bemichel Feb 2, 2024 Author

bemichel
Feb 1, 2024

bemichel
Feb 2, 2024
Author