Skip to content

Unable to build with -O0 -mllvm -enzyme-auto-sparsity=1 #2366

@nkoukpaizan

Description

@nkoukpaizan

I am unable to build with -O0 -mllvm -enzyme-auto-sparsity=1. Higher optimization levels work.

A simple reproducer is ringspring.cpp in the sparse integration tests.

The error message is:

/app/Enzyme/enzyme/Enzyme/FunctionUtils.cpp:8041: void replaceToDense(llvm::CallBase*, bool, llvm::Function*, const llvm::DataLayout&): Assertion `SI->getValueOperand() != val' failed.

I'm including a link to Compiler Explorer, the source code, and the detailed error message below.

Source code for reference
// Compilation flags
// -O0 -fno-vectorize -ffast-math -fno-unroll-loops  -mllvm -enable-load-pre=0 -mllvm -enzyme-auto-sparsity=1
// Compilation fails with -O0. Works with higher optimization levels, e.g., -O1.

#include <stdio.h>
#include <assert.h>
#include <vector>
#include <math.h>
#include <cmath>

#include <sys/time.h>
// Elapsed time from *start to *end, in seconds, narrowed to float on return.
float tdiff(struct timeval *start, struct timeval *end) {
  const double whole_seconds = end->tv_sec - start->tv_sec;
  const double micro_delta = end->tv_usec - start->tv_usec;
  // The microsecond delta may be negative; it simply subtracts from the total.
  return whole_seconds + 1e-6 * micro_delta;
}

// A (row, col, val) record; the element type of the triplet vectors filled
// by inner_storeflt / inner_storedbl.
template <typename T>
struct Triple {
    size_t row;
    size_t col;
    T val;

    // Declaring the move constructor means the implicit copy constructor and
    // copy assignment are deleted, so a Triple can be moved but not copied.
    Triple(Triple &&) = default;

    // Member-wise construction from the three components.
    Triple(size_t r, size_t c, T v) : row(r), col(c), val(v) {}
};

// Appends the entry (row, col, val) to `triplets` (single-precision variant).
// With BENCHMARK defined, values that compare equal to zero are dropped before
// the append; without it every value is recorded and a compile-time #warning
// is emitted.
// NOTE(review): the enzyme_sparse_accumulate attribute is consumed by the
// Enzyme plugin; its exact semantics are not visible in this file.
__attribute__((enzyme_sparse_accumulate))
static void inner_storeflt(int64_t row, int64_t col, float val, std::vector<Triple<float>> &triplets) {
#ifdef BENCHMARK
    if (val == 0.0) return;
#else
#warning "Compiling for debug/verfication, performance may be slowed"
#endif
    triplets.emplace_back(row, col, val);
}

// Appends the entry (row, col, val) to `triplets` (double-precision variant).
// With BENCHMARK defined, values that compare equal to zero are dropped before
// the append; without it every value is recorded and a compile-time #warning
// is emitted.
// NOTE(review): the enzyme_sparse_accumulate attribute is consumed by the
// Enzyme plugin; its exact semantics are not visible in this file.
__attribute__((enzyme_sparse_accumulate))
static void inner_storedbl(int64_t row, int64_t col, double val, std::vector<Triple<double>> &triplets) {
#ifdef BENCHMARK
    if (val == 0.0) return;
#else
#warning "Compiling for debug/verfication, performance may be slowed"
#endif
    triplets.emplace_back(row, col, val);
}

// Store callback passed (as a void*) to __enzyme_todense in hess_f.
// Ignores values equal to zero; otherwise divides idx by sizeof(T) and forwards
// (i, idx, val) to inner_storeflt when sizeof(T) == 4, else to inner_storedbl.
// NOTE(review): idx is presumably a byte offset that the division turns into
// an element index -- confirm against Enzyme's todense calling convention.
template<typename T>
__attribute__((always_inline))
static void sparse_store(T val, int64_t idx, size_t i, std::vector<Triple<T>> &triplets) {
    // Zero contributions are not recorded.
    if (val == 0.0) return;
    idx /= sizeof(T);
    // Only the branch matching sizeof(T) is instantiated.
    if constexpr (sizeof(T) == 4)
      inner_storeflt(i, idx, val, triplets);
    else
      inner_storedbl(i, idx, val, triplets);
}

// Load callback paired with sparse_store in hess_f's __enzyme_todense call.
// Always yields zero; idx, i and triplets are not read.
template<typename T>
__attribute__((always_inline))
static T sparse_load(int64_t idx, size_t i, std::vector<Triple<T>> &triplets) {
    return 0.0;
}

// Store callback for the identity-valued operand served by ident_load.
// That operand is read-only, so any call is a logic error: the function
// asserts unconditionally and ignores its arguments.
template<typename T>
__attribute__((always_inline))
static void ident_store(T, int64_t idx, size_t i) {
    // The message previously said "load", which misdescribed this store hook.
    assert(0 && "should never store");
}

// Load callback describing an identity pattern: after dividing idx by
// sizeof(T), yields 1 when the result equals i and 0 otherwise.
template<typename T>
__attribute__((always_inline))
static T ident_load(int64_t idx, size_t i) {
    const int64_t element = idx / sizeof(T);
    return (element == i) ? T(1) : T(0);
}

// Marker globals that are passed inside the __enzyme_* argument lists below
// (enzyme_const, enzyme_dup and enzyme_dupnoneed are used by grad_f / hess_f).
// They are only declared here, never defined in this file.
// NOTE(review): presumably resolved by the Enzyme plugin at compile time --
// confirm against the Enzyme documentation.
extern int enzyme_width;
extern int enzyme_dup;
extern int enzyme_dupv;
extern int enzyme_const;
extern int enzyme_dupnoneed;

// Declaration of the autodiff entry point used by grad_f: first argument is
// the function to differentiate (as void*), followed by its annotated arguments.
template <typename T, typename... Tys>
extern T __enzyme_autodiff(void*, Tys...) noexcept;

// Declaration of the forward-mode entry point used by hess_f.
template <typename T, typename... Tys>
extern T __enzyme_fwddiff(void *, Tys...) noexcept;

// Declaration of the "to dense" entry point; hess_f calls it with a load
// callback, a store callback and extra arguments, and uses the result as a T*.
template <typename T, typename... Tys>
extern T __enzyme_todense(Tys...) noexcept;

// Declared but not called anywhere in the visible source.
template <typename T, typename... Tys>
extern T __enzyme_post_sparse_todense(Tys...) noexcept;

// Writes x[k] - y[k] into out[k] for every k in [0, n).
// n is deduced from `out`; x and y must each provide at least n elements.
template<typename T, size_t n>
__attribute__((always_inline))
static void elementwise_difference(T (&out)[n], const T x[n], const T y[n]) {
    // size_t index: avoids comparing a signed int against the size_t bound n
    // and matches the index type used by dot_product and norm.
    #pragma clang loop unroll(full)
    for (size_t i = 0; i < n; ++i)
        out[i] = x[i] - y[i];
}

// Writes x[k] + y[k] into out[k] for every k in [0, n).
// n is deduced from `out`; x and y must each provide at least n elements.
template<typename T, size_t n>
__attribute__((always_inline))
static void elementwise_sum(T (&out)[n], const T x[n], const T y[n]) {
    // size_t index: avoids comparing a signed int against the size_t bound n
    // and matches the index type used by dot_product and norm.
    #pragma clang loop unroll(full)
    for (size_t i = 0; i < n; ++i)
        out[i] = x[i] + y[i];
}

// Returns the sum of a[k] * b[k] over k in [0, n), accumulated left to right.
// n cannot be deduced from the (decayed) array parameters and must be given
// explicitly, e.g. dot_product<double, 3>(a, b).
template<typename T, size_t n>
__attribute__((always_inline))
static T dot_product(const T a[n], const T b[n]) {
    T acc = 0.0;
    #pragma clang loop unroll(full)
    for (size_t k = 0; k != n; ++k)
        acc += a[k] * b[k];
    return acc;
}


// Returns the square root of the sum of squares of v[0..n-1].
// n must be supplied explicitly, e.g. norm<double, 3>(v).
template<typename T, size_t n>
__attribute__((always_inline))
static T norm(const T v[n]) {
    T accum = 0.0;
    #pragma clang loop unroll(full)
    for (size_t k = 0; k != n; ++k) {
        const T component = v[k];
        accum += component * component;
    }
    return std::sqrt(accum);
}

// Writes the transpose of the m-by-n matrix `in` into the n-by-m matrix `out`:
// out[i][j] = in[j][i]. `out` must not overlap `in`, because entries are
// written while other entries of `in` are still to be read.
template<typename T, size_t n, size_t m>
__attribute__((always_inline))
static void transpose(T (&out)[n][m], const T in[m][n]) {
    // size_t indices: the bounds n and m are size_t, so int counters mixed
    // signed and unsigned operands in the loop conditions.
    #pragma clang loop unroll(full)
    for (size_t i = 0; i < n; ++i)
        #pragma clang loop unroll(full)
        for (size_t j = 0; j < m; ++j)
            out[i][j] = in[j][i];
}

// Computes result = matrix1 * matrix2 for an m-by-n and an n-by-k matrix.
// Each result[i][j] is zeroed and then accumulated in place, so `result`
// must not overlap either input.
template<typename T, size_t m, size_t n, size_t k>
__attribute__((always_inline))
static void matrix_multiply(T (&result)[m][k], const T matrix1[m][n], const T matrix2[n][k]) {
    // size_t indices: the bounds m, n and k are size_t, so int counters mixed
    // signed and unsigned operands in the loop conditions.
    #pragma clang loop unroll(full)
    for (size_t i = 0; i < m; ++i) {
        #pragma clang loop unroll(full)
        for (size_t j = 0; j < k; ++j) {
            result[i][j] = 0.0;
            #pragma clang loop unroll(full)
            for (size_t z = 0; z < n; ++z) {
                result[i][j] += matrix1[i][z] * matrix2[z][j];
            }
        }
    }
}


// Inverts the 3x3 matrix F into `out` using the adjugate divided by the
// determinant. No singularity check is made: a zero determinant produces a
// division by zero in T's arithmetic.
// `out` may be the same array as F (in-place inversion).
template<typename T>
__attribute__((always_inline))
static void inv(T (&out)[3][3], const T (&F)[3][3]) {
    // Snapshot every input before the first write; otherwise inv(A, A) would
    // read entries that have already been overwritten with results.
    const T f00 = F[0][0];
    const T f01 = F[0][1];
    const T f02 = F[0][2];
    const T f10 = F[1][0];
    const T f11 = F[1][1];
    const T f12 = F[1][2];
    const T f20 = F[2][0];
    const T f21 = F[2][1];
    const T f22 = F[2][2];

    // Cofactor expansion along the first row.
    const T det = f00 * (f11 * f22 - f12 * f21)
                - f01 * (f10 * f22 - f12 * f20)
                + f02 * (f10 * f21 - f11 * f20);

    const T inv_det = 1 / det;

    out[0][0] = (f11 * f22 - f12 * f21) * inv_det;
    out[0][1] = (f02 * f21 - f01 * f22) * inv_det;
    out[0][2] = (f01 * f12 - f02 * f11) * inv_det;

    out[1][0] = (f12 * f20 - f10 * f22) * inv_det;
    out[1][1] = (f00 * f22 - f02 * f20) * inv_det;
    out[1][2] = (f02 * f10 - f00 * f12) * inv_det;

    out[2][0] = (f10 * f21 - f11 * f20) * inv_det;
    out[2][1] = (f01 * f20 - f00 * f21) * inv_det;
    out[2][2] = (f00 * f11 - f01 * f10) * inv_det;
}


// Inverts the 2x2 matrix F into `out`: swaps the diagonal, negates the
// off-diagonal and divides by the determinant. No singularity check is made.
// `out` may be the same array as F (in-place inversion).
template<typename T>
__attribute__((always_inline))
static void inv(T (&out)[2][2], const T (&F)[2][2]) {
    // Snapshot every input before the first write; otherwise inv(A, A) would
    // compute out[1][1] from the already-overwritten F[0][0].
    const T f00 = F[0][0];
    const T f01 = F[0][1];
    const T f10 = F[1][0];
    const T f11 = F[1][1];

    const T det = f00 * f11 - f01 * f10;

    const T inv_det = 1 / det;

    out[0][0] = f11 * inv_det;
    out[0][1] = -f01 * inv_det;
    out[1][0] = -f10 * inv_det;
    out[1][1] = f00 * inv_det;
}

// Computes mat^T * (mat * mat^T)^-1 for the m-by-n matrix `mat` and writes the
// n-by-m result into matTsqrinv. The m-by-m inverse is delegated to inv, which
// this file only provides for 2x2 and 3x3.
template<typename T, size_t m, size_t n>
__attribute__((always_inline))
static void pseudo_inverse(T (&matTsqrinv)[n][m], const T mat[m][n]) {
    T transposed[n][m];
    T product[m][m];
    T product_inverse[m][m];

    transpose(transposed, mat);                      // mat^T
    matrix_multiply(product, mat, transposed);       // mat * mat^T
    inv(product_inverse, product);                   // (mat * mat^T)^-1
    matrix_multiply(matTsqrinv, transposed, product_inverse);
}

// Gathers n rows of three coordinates each from the flat array `pos` into
// `out`: row i is copied from pos[m*idx[i] + 0 .. m*idx[i] + 2].
// m is the row width and must be 3 (enforced by the static_assert); n is the
// number of rows. `pos` is always `const float*`, independent of T.
template<typename T, int n, int m>
__attribute__((always_inline))
static void get_pos(
    T (&__restrict__ out)[n][m],
    const float *__restrict__ pos,
    const int idx[n]) {

    static_assert(m == 3, "Only Vector3 is supported");

    // One 3-component point per selected index.
    #pragma clang loop unroll(full)
    for (int row = 0; row < n; ++row) {
        const float *point = pos + m * idx[row];
        out[row][0] = point[0];
        out[row][1] = point[1];
        out[row][2] = point[2];
    }
}


// Copies the first n consecutive 3-component points of the flat array `pos`
// into `out`: row i is read from pos[m*i + 0 .. m*i + 2].
// m is the row width and must be 3 (enforced by the static_assert); n is the
// number of rows. `pos` is always `const float*`, independent of T.
template<typename T, int n, int m>
__attribute__((always_inline))
static void get_pos_affine(
    T (&__restrict__ out)[n][m],
    const float *__restrict__ pos) {

    static_assert(m == 3, "Only Vector3 is supported");

    // One 3-component point per row, taken in storage order.
    #pragma clang loop unroll(full)
    for (int row = 0; row < n; ++row) {
        const float *point = pos + m * row;
        out[row][0] = point[0];
        out[row][1] = point[1];
        out[row][2] = point[2];
    }
}

// Writes the cross product v1 x v2 into `out`.
// `out` may be the same storage as v1 or v2.
template<typename T>
__attribute__((always_inline))
static void cross(T (&out)[3], const T v1[3], const T v2[3]) {
    // Read all six components before writing anything: with out aliasing an
    // input, writing out[0] first would corrupt the values used for out[1]
    // and out[2].
    const T a0 = v1[0];
    const T a1 = v1[1];
    const T a2 = v1[2];
    const T b0 = v2[0];
    const T b1 = v2[1];
    const T b2 = v2[2];

    out[0] = a1 * b2 - a2 * b1;
    out[1] = a2 * b0 - a0 * b2;
    out[2] = a0 * b1 - a1 * b0;
}


template<typename T>
__attribute__((always_inline))
static T area(const T *__restrict__ u, const T *__restrict__ v, const T *__restrict__ w) {
    T diff1[3];
    elementwise_difference(diff1, v, u);
    T diff2[3];
    elementwise_difference(diff2, w, u);
    T cross_product[3];
    cross(cross_product, diff1, diff2);
    return 0.5 * norm<T, 3>(cross_product);
}   

// Function differentiated by grad_f: returns the sum over i in [0, N) of
// (sqrt((input[i+1] - input[i])^2) - 1)^2.
// Reads input[0] .. input[N] inclusive, i.e. N + 1 elements. In hess_f the
// pointer passed down comes from __enzyme_todense with mod_load, which indexes
// the underlying array modulo N.
// The accumulator is a double regardless of T; it is converted to T on return.
template<typename T>
__attribute__((always_inline))
static T f(size_t N, T* input) {
    double out = 0;
    // __builtin_assume(!((N-1) == 0));
    for (size_t i=0; i<N; i++) {
        // Earlier formulation, kept for reference:
        //double sub = input[i] - input[i+1]; 
        // out += sub * sub;
        // Squared difference of neighbouring entries.
        T sub = (input[i+1] - input[i]) * (input[i+1] - input[i]);
        out += (sqrt(sub) - 1)*(sqrt(sub) - 1);
    }
    return out;
}

// Calls __enzyme_autodiff<void> on f<T>, passing N under enzyme_const and the
// pair (input, dinput) under enzyme_dup.
// NOTE(review): presumably Enzyme accumulates the gradient of f with respect
// to input into dinput -- confirm against the Enzyme documentation.
template<typename T>
__attribute__((always_inline))
static void grad_f(size_t N, T* input, T* dinput) {
    __enzyme_autodiff<void>((void*)f<T>, enzyme_const, N, enzyme_dup, input, dinput);
}

// Load callback passed to __enzyme_todense in hess_f for d_input.
// After dividing idx by sizeof(double), returns 1.0 when the resulting index
// equals i, treating index N as index 0; returns 0.0 otherwise.
// T is not used in the body. Two alternative formulations of the same test
// are left commented out.
template<typename T>
__attribute__((always_inline))
double ringident_load(int64_t idx, size_t i, size_t N) {
    idx /= sizeof(double);
    // return (double)( ( (idx == N) ? 0 : idx) == i);
    return (double)((idx != N && idx == i) || (idx == N && 0 == i));
    // return (double)( idx % N == i);
}
// Store callback for the read-only operands built in hess_f: any call is a
// logic error, so it asserts unconditionally and ignores its arguments.
template<typename T>
__attribute__((always_inline))
void never_store(T val, int64_t idx, T* input, size_t N) {
    assert(0 && "this is a read only input, why are you storing here...");
}

// Load callback passed to __enzyme_todense in hess_f for `input`.
// Divides idx by sizeof(double) and returns input[idx % N], so index N reads
// element 0. T is not used in the body; the element type is fixed to double.
template<typename T>
__attribute__((always_inline))
double mod_load(int64_t idx, double* input, size_t N) {
    idx /= sizeof(double);
    return input[idx % N];
}

// Collects triplets by calling __enzyme_fwddiff on grad_f (which itself calls
// __enzyme_autodiff on f) once per index i in [0, N), and returns them.
// Callers must pass N >= 2: the function asserts N > 0 and N != 1 to the
// optimizer via __builtin_assume.
template<typename T>
__attribute__((noinline))
std::vector<Triple<T>> hess_f(size_t N, T* input) {
    std::vector<Triple<T>> triplets;
    // Replace `input` by a __enzyme_todense pointer: loads go through mod_load
    // (index modulo N), stores hit never_store.
    input = __enzyme_todense<T*>((void*)mod_load<T>, (void*)never_store<T>, input, N);
    __builtin_assume(N > 0);
    __builtin_assume(N != 1);
    for (size_t i=0; i<N; i++) {
        __builtin_assume(i < 100000000);
        // Operand whose loads are served by ringident_load with (i, N).
        T* d_input = __enzyme_todense<T*>((void*)ringident_load<T>, (void*)never_store<T>, i, N);
        // Operand whose stores are routed to sparse_store, which appends to `triplets`.
        T* d_dinput = __enzyme_todense<T*>((void*)sparse_load<T>, (void*)sparse_store<T>, i, &triplets);

        // NOTE(review): (T*)0x1 is a placeholder passed under enzyme_dupnoneed;
        // presumably it is never dereferenced -- confirm.
       __enzyme_fwddiff<void>((void*)grad_f<T>, 
                            enzyme_const, N,
                            enzyme_dup, input, d_input,
                            enzyme_dupnoneed, (T*)0x1, d_dinput);

    }
    return triplets;
}


// Driver: fills x[i] = (i + 1)^2 for i in [0, N), times a single hess_f call
// and prints the number of triplets, the elapsed time and, for N <= 30, every
// triplet. N defaults to 8 and can be overridden by argv[1].
// Returns 1 without running anything when argv[1] is not an integer >= 2.
int main(int argc, char** argv) {
    size_t N = 8;

    if (argc >= 2) {
        // hess_f promises the optimizer N > 0 and N != 1 via __builtin_assume,
        // so smaller values (including the 0 that atoi yields for text it
        // cannot convert) must be rejected here instead of being passed on.
        const int requested = atoi(argv[1]);
        if (requested < 2) {
            fprintf(stderr, "N must be an integer >= 2, got \"%s\"\n", argv[1]);
            return 1;
        }
        N = static_cast<size_t>(requested);
    }

    // Owned by a vector so the buffer is released on every exit path.
    std::vector<double> x(N);
    for (size_t i = 0; i < N; i++) {
        // Square in double so large N cannot overflow an int product.
        const double base = static_cast<double>(i + 1);
        x[i] = base * base;
    }

    struct timeval start, end;
    gettimeofday(&start, NULL);

    auto res = hess_f(N, x.data());

    gettimeofday(&end, NULL);

    // size_t values are printed with %zu; %ld only works where long and
    // size_t happen to have the same representation.
    printf("Number of elements %zu\n", res.size());

    printf("Runtime %0.6f\n", tdiff(&start, &end));

    if (N <= 30) {
        for (auto & tup : res)
            printf("%zu, %zu = %f\n", tup.row, tup.col, tup.val);
    }

    return 0;
}
Error message details

clang++: /app/Enzyme/enzyme/Enzyme/FunctionUtils.cpp:8041: void replaceToDense(llvm::CallBase*, bool, llvm::Function*, const llvm::DataLayout&): Assertion `SI->getValueOperand() != val' failed.
PLEASE submit a bug report to https://github.com/llvm/llvm-project/issues/ and include the crash backtrace, preprocessed source, and associated run script.
Stack dump:
0. Program arguments: /opt/compiler-explorer/clang-16.0.0/bin/clang++ -gdwarf-4 -g -o /tmp/compiler-explorer-compiler2025528-98-1f78sgi.0wxqk/output.s -fno-verbose-asm -c -fpass-plugin=/opt/compiler-explorer/main/ClangEnzyme-16.so -Xclang -load -Xclang /opt/compiler-explorer/main/ClangEnzyme-16.so -fcolor-diagnostics -fno-crash-diagnostics -O0 -fno-vectorize -ffast-math -fno-unroll-loops -mllvm -enable-load-pre=0 -mllvm -enzyme-auto-sparsity=1

  1. parser at end of file
  2. Optimizer
    #0 0x0000555b5e20a8df llvm::sys::PrintStackTrace(llvm::raw_ostream&, int) (/opt/compiler-explorer/clang-16.0.0/bin/clang+++0x3ba18df)
    #1 0x0000555b5e20890c llvm::sys::CleanupOnSignal(unsigned long) (/opt/compiler-explorer/clang-16.0.0/bin/clang+++0x3b9f90c)
    #2 0x0000555b5e15b518 CrashRecoverySignalHandler(int) CrashRecoveryContext.cpp:0:0
    #3 0x00007f2513021520 (/lib/x86_64-linux-gnu/libc.so.6+0x42520)
    #4 0x00007f25130759fc pthread_kill (/lib/x86_64-linux-gnu/libc.so.6+0x969fc)
    #5 0x00007f2513021476 gsignal (/lib/x86_64-linux-gnu/libc.so.6+0x42476)
    #6 0x00007f25130077f3 abort (/lib/x86_64-linux-gnu/libc.so.6+0x287f3)
    #7 0x00007f251300771b (/lib/x86_64-linux-gnu/libc.so.6+0x2871b)
    #8 0x00007f2513018e96 (/lib/x86_64-linux-gnu/libc.so.6+0x39e96)
    #9 0x00007f2512a3265b replaceToDense(llvm::CallBase*, bool, llvm::Function*, llvm::DataLayout const&) /app/Enzyme/enzyme/Enzyme/FunctionUtils.cpp:8042:62
    #10 0x00007f2512a3330b LowerSparsification(llvm::Function*, bool) /app/Enzyme/enzyme/Enzyme/FunctionUtils.cpp:8128:3
    #11 0x00007f25128e65e8 (anonymous namespace)::EnzymeBase::lowerEnzymeCalls(llvm::Function&, std::set<llvm::Function*, std::lessllvm::Function*, std::allocatorllvm::Function*>&) /app/Enzyme/enzyme/Enzyme/Enzyme.cpp:2926:47
    #12 0x00007f25128e73b5 (anonymous namespace)::EnzymeBase::run(llvm::Module&) /app/Enzyme/enzyme/Enzyme/Enzyme.cpp:3057:15
    #13 0x00007f25128efd92 EnzymeNewPM::run(llvm::Module&, llvm::AnalysisManagerllvm::Module&) /app/Enzyme/enzyme/Enzyme/Enzyme.cpp:3318:56
    #14 0x00007f2512932d4f llvm::detail::PassModel<llvm::Module, EnzymeNewPM, llvm::PreservedAnalyses, llvm::AnalysisManagerllvm::Module>::run(llvm::Module&, llvm::AnalysisManagerllvm::Module&) /opt/compiler-explorer/clang-16.0.0/include/llvm/IR/PassManagerInternal.h:90:3
    #15 0x0000555b5dabbfb9 llvm::PassManager<llvm::Module, llvm::AnalysisManagerllvm::Module>::run(llvm::Module&, llvm::AnalysisManagerllvm::Module&) (/opt/compiler-explorer/clang-16.0.0/bin/clang+++0x3452fb9)
    #16 0x0000555b5e5edb44 (anonymous namespace)::EmitAssemblyHelper::RunOptimizationPipeline(clang::BackendAction, std::unique_ptr<llvm::raw_pwrite_stream, std::default_deletellvm::raw_pwrite_stream>&, std::unique_ptr<llvm::ToolOutputFile, std::default_deletellvm::ToolOutputFile>&) BackendUtil.cpp:0:0
    #17 0x0000555b5e5f0bd6 clang::EmitBackendOutput(clang::DiagnosticsEngine&, clang::HeaderSearchOptions const&, clang::CodeGenOptions const&, clang::TargetOptions const&, clang::LangOptions const&, llvm::StringRef, llvm::Module*, clang::BackendAction, std::unique_ptr<llvm::raw_pwrite_stream, std::default_deletellvm::raw_pwrite_stream>) (/opt/compiler-explorer/clang-16.0.0/bin/clang+++0x3f87bd6)
    #18 0x0000555b5f4fb238 clang::BackendConsumer::HandleTranslationUnit(clang::ASTContext&) (/opt/compiler-explorer/clang-16.0.0/bin/clang+++0x4e92238)
    #19 0x0000555b5ee10b50 clang::MultiplexConsumer::HandleTranslationUnit(clang::ASTContext&) (/opt/compiler-explorer/clang-16.0.0/bin/clang+++0x47a7b50)
    #20 0x0000555b6071fd5d clang::ParseAST(clang::Sema&, bool, bool) (/opt/compiler-explorer/clang-16.0.0/bin/clang+++0x60b6d5d)
    #21 0x0000555b5f4fa865 clang::CodeGenAction::ExecuteAction() (/opt/compiler-explorer/clang-16.0.0/bin/clang+++0x4e91865)
    #22 0x0000555b5edd6671 clang::FrontendAction::Execute() (/opt/compiler-explorer/clang-16.0.0/bin/clang+++0x476d671)
    #23 0x0000555b5ed5a993 clang::CompilerInstance::ExecuteAction(clang::FrontendAction&) (/opt/compiler-explorer/clang-16.0.0/bin/clang+++0x46f1993)
    #24 0x0000555b5eebab3b clang::ExecuteCompilerInvocation(clang::CompilerInstance*) (/opt/compiler-explorer/clang-16.0.0/bin/clang+++0x4851b3b)
    #25 0x0000555b5b9e341c cc1_main(llvm::ArrayRef<char const*>, char const*, void*) (/opt/compiler-explorer/clang-16.0.0/bin/clang+++0x137a41c)
    #26 0x0000555b5b9debdc ExecuteCC1Tool(llvm::SmallVectorImpl<char const*>&) driver.cpp:0:0
    #27 0x0000555b5ebbbce9 void llvm::function_ref<void ()>::callback_fn<clang::driver::CC1Command::Execute(llvm::ArrayRef<std::optionalllvm::StringRef>, std::__cxx11::basic_string<char, std::char_traits, std::allocator>, bool) const::'lambda'()>(long) Job.cpp:0:0
    #28 0x0000555b5e15b977 llvm::CrashRecoveryContext::RunSafely(llvm::function_ref<void ()>) (/opt/compiler-explorer/clang-16.0.0/bin/clang+++0x3af2977)
    #29 0x0000555b5ebbbf1c clang::driver::CC1Command::Execute(llvm::ArrayRef<std::optionalllvm::StringRef>, std::__cxx11::basic_string<char, std::char_traits, std::allocator>, bool) const (.part.0) Job.cpp:0:0
    #30 0x0000555b5eb843fc clang::driver::Compilation::ExecuteCommand(clang::driver::Command const&, clang::driver::Command const*&, bool) const (/opt/compiler-explorer/clang-16.0.0/bin/clang+++0x451b3fc)
    #31 0x0000555b5eb84ded clang::driver::Compilation::ExecuteJobs(clang::driver::JobList const&, llvm::SmallVectorImpl<std::pair<int, clang::driver::Command const*>>&, bool) const (/opt/compiler-explorer/clang-16.0.0/bin/clang+++0x451bded)
    #32 0x0000555b5eb8f23c clang::driver::Driver::ExecuteCompilation(clang::driver::Compilation&, llvm::SmallVectorImpl<std::pair<int, clang::driver::Command const*>>&) (/opt/compiler-explorer/clang-16.0.0/bin/clang+++0x452623c)
    #33 0x0000555b5b9e1523 clang_main(int, char**) (/opt/compiler-explorer/clang-16.0.0/bin/clang+++0x1378523)
    #34 0x00007f2513008d90 (/lib/x86_64-linux-gnu/libc.so.6+0x29d90)
    #35 0x00007f2513008e40 __libc_start_main (/lib/x86_64-linux-gnu/libc.so.6+0x29e40)
    #36 0x0000555b5b9dad1e _start (/opt/compiler-explorer/clang-16.0.0/bin/clang+++0x1371d1e)
    clang-16: error: clang frontend command failed with exit code 134 (use -v to see invocation)
    Compiler returned: 134

CC @pelesh @wsmoses

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions