Unable to build with `-O0 -mllvm -enzyme-auto-sparsity=1`

I am unable to build with `-O0 -mllvm -enzyme-auto-sparsity=1`. Higher optimization levels work.

A simple reproducer is [ringspring.cpp](https://github.com/EnzymeAD/Enzyme/blob/main/enzyme/test/Integration/Sparse/ringspring.cpp) in the sparse integration tests.

The error message is:
```
/app/Enzyme/enzyme/Enzyme/FunctionUtils.cpp:8041: void replaceToDense(llvm::CallBase*, bool, llvm::Function*, const llvm::DataLayout&): Assertion `SI->getValueOperand() != val' failed.
```

I'm including a [link to compiler explorer](https://fwd.gymni.ch/wrYe0n), the source code, and the detailed error message below.

<details>

<summary>Source code for reference </summary>

```cpp
// Compilation flags
// -O0 -fno-vectorize -ffast-math -fno-unroll-loops  -mllvm -enable-load-pre=0 -mllvm -enzyme-auto-sparsity=1
// Compilation fails with -O0. Works with higher optimization levels, e.g., -O1.

#include <stdio.h>
#include <assert.h>
#include <vector>
#include <math.h>
#include <cmath>

#include <sys/time.h>
// Elapsed time from *start to *end, in seconds.
// The second and microsecond fields are differenced separately; their sum is
// formed in double and narrowed to float on return.
float tdiff(struct timeval *start, struct timeval *end) {
  const auto delta_sec = end->tv_sec - start->tv_sec;
  const auto delta_usec = end->tv_usec - start->tv_usec;
  return delta_sec + 1e-6 * delta_usec;
}

// Record holding a row index, a column index and a value of type T.
// Because a move constructor is declared, the implicit copy operations are
// suppressed: a Triple can be moved but not copied.
template<typename T>
struct Triple {
    size_t row;
    size_t col;
    T val;

    // Member-wise construction from explicit indices and value.
    Triple(size_t r, size_t c, T v) : row{r}, col{c}, val{v} {}

    // Member-wise move.
    Triple(Triple&&) = default;
};

// Appends the entry (row, col, val) to `triplets` (single-precision variant).
// With BENCHMARK defined, a value equal to zero is dropped instead of being
// recorded. Without it, every value is recorded and the build emits a
// warning that this configuration may be slower.
// NOTE(review): the enzyme_sparse_accumulate attribute is interpreted by the
// Enzyme plugin; its exact semantics are not visible in this file.
__attribute__((enzyme_sparse_accumulate))
static void inner_storeflt(int64_t row, int64_t col, float val, std::vector<Triple<float>> &triplets) {
#ifdef BENCHMARK
    if (val == 0.0) return;
#else
#warning "Compiling for debug/verfication, performance may be slowed"
#endif
    // row/col are converted from int64_t to Triple's size_t fields here.
    triplets.emplace_back(row, col, val);
}

// Appends the entry (row, col, val) to `triplets` (double-precision variant).
// With BENCHMARK defined, a value equal to zero is dropped instead of being
// recorded. Without it, every value is recorded and the build emits a
// warning that this configuration may be slower.
// NOTE(review): the enzyme_sparse_accumulate attribute is interpreted by the
// Enzyme plugin; its exact semantics are not visible in this file.
__attribute__((enzyme_sparse_accumulate))
static void inner_storedbl(int64_t row, int64_t col, double val, std::vector<Triple<double>> &triplets) {
#ifdef BENCHMARK
    if (val == 0.0) return;
#else
#warning "Compiling for debug/verfication, performance may be slowed"
#endif
    // row/col are converted from int64_t to Triple's size_t fields here.
    triplets.emplace_back(row, col, val);
}

// Store handler that records a nonzero value into the triplet list.
//   val      - value being stored; a value equal to zero is ignored.
//   idx      - divided by sizeof(T) before use; presumably a byte offset into
//              the emulated dense array (TODO confirm against Enzyme).
//   i        - forwarded as the row of the recorded entry.
//   triplets - destination list.
// The resulting element index is forwarded as the column.
template<typename T>
__attribute__((always_inline))
static void sparse_store(T val, int64_t idx, size_t i, std::vector<Triple<T>> &triplets) {
    if (val == 0.0) return;
    idx /= sizeof(T);
    // Dispatch on the size of T: 4-byte types use the float sink, everything
    // else uses the double sink.
    if constexpr (sizeof(T) == 4)
      inner_storeflt(i, idx, val, triplets);
    else
      inner_storedbl(i, idx, val, triplets);
}

// Load handler paired with sparse_store: always yields zero.
// All three arguments are ignored.
template<typename T>
__attribute__((always_inline))
static T sparse_load(int64_t idx, size_t i, std::vector<Triple<T>> &triplets) {
    return 0.0;
}

// Store handler that must never run: any call trips the assertion (when
// assertions are enabled). All three arguments are ignored.
template<typename T>
__attribute__((always_inline))
static void ident_store(T, int64_t idx, size_t i) {
    // The message used to say "load", which misreported the failing
    // operation; this is the store path.
    assert(0 && "should never store");
}

// Identity-matrix entry: yields 1 when the element index derived from `idx`
// equals `i`, and 0 otherwise.
// `idx` is divided by sizeof(T) first; presumably it arrives as a byte
// offset (TODO confirm against the caller).
template<typename T>
__attribute__((always_inline))
static T ident_load(int64_t idx, size_t i) {
    const int64_t element = idx / sizeof(T);
    return (element == i) ? T(1) : T(0);
}

// Marker symbols declared for Enzyme; no definitions appear in this file.
// In this file, enzyme_const, enzyme_dup and enzyme_dupnoneed are passed
// immediately before the argument(s) they annotate. enzyme_width and
// enzyme_dupv are declared but not used here.
extern int enzyme_width;
extern int enzyme_dup;
extern int enzyme_dupv;
extern int enzyme_const;
extern int enzyme_dupnoneed;

// Entry points declared without a body. Presumably the Enzyme plugin
// recognizes them by name and replaces each call (assumption; nothing in
// this file defines them).

// Used by grad_f on f<T>.
template <typename T, typename... Tys>
extern T __enzyme_autodiff(void*, Tys...) noexcept;

// Used by hess_f on grad_f<T>.
template <typename T, typename... Tys>
extern T __enzyme_fwddiff(void *, Tys...) noexcept;

// Used by hess_f: takes a load handler, a store handler and extra arguments,
// and returns a value of type T (a pointer in every use in this file).
template <typename T, typename... Tys>
extern T __enzyme_todense(Tys...) noexcept;

// Declared but not called in this file.
template <typename T, typename... Tys>
extern T __enzyme_post_sparse_todense(Tys...) noexcept;

// Writes out[i] = x[i] - y[i] for every i in [0, n).
// n is deduced from the bound of `out`. x and y are received as plain
// pointers and must each address at least n readable elements.
template<typename T, size_t n>
__attribute__((always_inline))
static void elementwise_difference(T (&out)[n], const T x[n], const T y[n]) {
    // The index is size_t to match n: an int index mixes signedness in the
    // comparison and cannot cover bounds above INT_MAX.
    #pragma clang loop unroll(full)
    for (size_t i = 0; i < n; ++i)
        out[i] = x[i] - y[i];
}

// Writes out[i] = x[i] + y[i] for every i in [0, n).
// n is deduced from the bound of `out`. x and y are received as plain
// pointers and must each address at least n readable elements.
template<typename T, size_t n>
__attribute__((always_inline))
static void elementwise_sum(T (&out)[n], const T x[n], const T y[n]) {
    // The index is size_t to match n: an int index mixes signedness in the
    // comparison and cannot cover bounds above INT_MAX.
    #pragma clang loop unroll(full)
    for (size_t i = 0; i < n; ++i)
        out[i] = x[i] + y[i];
}

// Returns a[0]*b[0] + ... + a[n-1]*b[n-1], accumulated left to right
// starting from zero. n cannot be deduced (both parameters decay to
// pointers), so callers spell out the template arguments.
template<typename T, size_t n>
__attribute__((always_inline))
static T dot_product(const T a[n], const T b[n]) {
    T accumulated = 0.0;
    #pragma clang loop unroll(full)
    for (size_t k = 0; k != n; ++k)
        accumulated += a[k] * b[k];
    return accumulated;
}


// Euclidean length of the n-element vector v: the square root of the sum of
// squared components, accumulated left to right. n must be given explicitly
// because the parameter decays to a pointer.
template<typename T, size_t n>
__attribute__((always_inline))
static T norm(const T v[n]) {
    T squared_length = 0.0;
    #pragma clang loop unroll(full)
    for (size_t k = 0; k != n; ++k)
        squared_length += v[k] * v[k];
    return std::sqrt(squared_length);
}

// Writes the transpose of the m-by-n matrix `in` into the n-by-m matrix
// `out`: out[i][j] = in[j][i].
// `out` and `in` must not overlap; entries are written one at a time.
template<typename T, size_t n, size_t m>
__attribute__((always_inline))
static void transpose(T (&out)[n][m], const T in[m][n]) {
    // Indices are size_t to match the bounds: int indices mix signedness in
    // the comparisons and cannot cover bounds above INT_MAX.
    #pragma clang loop unroll(full)
    for (size_t i = 0; i < n; ++i)
        #pragma clang loop unroll(full)
        for (size_t j = 0; j < m; ++j)
            out[i][j] = in[j][i];
}

// Computes result = matrix1 * matrix2, where matrix1 is m-by-n, matrix2 is
// n-by-k and result is m-by-k.
// Each result entry is zeroed and then accumulated in place, so `result`
// must not overlap either input.
template<typename T, size_t m, size_t n, size_t k>
__attribute__((always_inline))
static void matrix_multiply(T (&result)[m][k], const T matrix1[m][n], const T matrix2[n][k]) {
    // Indices are size_t to match the bounds: int indices mix signedness in
    // the comparisons and cannot cover bounds above INT_MAX.
    #pragma clang loop unroll(full)
    for (size_t i = 0; i < m; ++i) {
        #pragma clang loop unroll(full)
        for (size_t j = 0; j < k; ++j) {
            result[i][j] = 0.0;
            #pragma clang loop unroll(full)
            for (size_t z = 0; z < n; ++z) {
                result[i][j] += matrix1[i][z] * matrix2[z][j];
            }
        }
    }
}


// Inverts the 3x3 matrix F into `out` using the cofactor (adjugate) formula.
// There is no check for a zero determinant: the reciprocal is taken as-is.
// `out` may be the same array as F: every entry of F is copied into a local
// before the first write, so in-place inversion gives the same result as
// inversion into a separate array.
template<typename T>
__attribute__((always_inline))
static void inv(T (&out)[3][3], const T (&F)[3][3]) {
    // Snapshot of F, taken before anything is written to `out`.
    const T f00 = F[0][0], f01 = F[0][1], f02 = F[0][2];
    const T f10 = F[1][0], f11 = F[1][1], f12 = F[1][2];
    const T f20 = F[2][0], f21 = F[2][1], f22 = F[2][2];

    // Determinant by expansion along the first row.
    T det = f00 * (f11 * f22 - f12 * f21)
              - f01 * (f10 * f22 - f12 * f20)
              + f02 * (f10 * f21 - f11 * f20);

    T inv_det = 1 / det;

    out[0][0] = (f11 * f22 - f12 * f21) * inv_det;
    out[0][1] = (f02 * f21 - f01 * f22) * inv_det;
    out[0][2] = (f01 * f12 - f02 * f11) * inv_det;

    out[1][0] = (f12 * f20 - f10 * f22) * inv_det;
    out[1][1] = (f00 * f22 - f02 * f20) * inv_det;
    out[1][2] = (f02 * f10 - f00 * f12) * inv_det;

    out[2][0] = (f10 * f21 - f11 * f20) * inv_det;
    out[2][1] = (f01 * f20 - f00 * f21) * inv_det;
    out[2][2] = (f00 * f11 - f01 * f10) * inv_det;
}


// Inverts the 2x2 matrix F into `out` using the closed-form adjugate.
// There is no check for a zero determinant: the reciprocal is taken as-is.
// `out` may be the same array as F: every entry of F is copied into a local
// before the first write, so in-place inversion gives the same result as
// inversion into a separate array.
template<typename T>
__attribute__((always_inline))
static void inv(T (&out)[2][2], const T (&F)[2][2]) {
    // Snapshot of F, taken before anything is written to `out`.
    const T f00 = F[0][0], f01 = F[0][1];
    const T f10 = F[1][0], f11 = F[1][1];

    T det = f00 * f11 - f01 * f10;

    T inv_det = 1 / det;

    out[0][0] = f11 * inv_det;
    out[0][1] = -f01 * inv_det;
    out[1][0] = -f10 * inv_det;
    out[1][1] = f00 * inv_det;
}

// Computes matTsqrinv = mat^T * (mat * mat^T)^{-1} for the m-by-n matrix
// `mat`; the result is n-by-m.
// The m-by-m product is inverted with inv(), which this file provides only
// for 2x2 and 3x3 matrices, so m must be 2 or 3.
template<typename T, size_t m, size_t n>
__attribute__((always_inline))
static void pseudo_inverse(T (&matTsqrinv)[n][m], const T mat[m][n]) {
    T transposed[n][m];
    T gram[m][m];
    T gram_inverse[m][m];

    transpose(transposed, mat);
    matrix_multiply(gram, mat, transposed);          // mat * mat^T
    inv(gram_inverse, gram);                         // (mat * mat^T)^{-1}
    matrix_multiply(matTsqrinv, transposed, gram_inverse);
}

// Gathers n points of m == 3 coordinates each from the flat array `pos`.
// Row i of `out` receives the three consecutive floats starting at
// pos[m * idx[i]]. Only m == 3 compiles (enforced by the static_assert).
template<typename T, int n, int m>
__attribute__((always_inline))
static void get_pos(
    T (&__restrict__ out)[n][m],
    const float *__restrict__ pos,
    const int idx[n]) {

    static_assert(m == 3, "Only Vector3 is supported");

    // Copy the 3D point selected by each of idx[0] .. idx[n-1].
    #pragma clang loop unroll(full)
    for (int point = 0; point < n; ++point) {
        const float *src = pos + m * idx[point];
        out[point][0] = src[0];
        out[point][1] = src[1];
        out[point][2] = src[2];
    }
}


// Copies the first n points of m == 3 coordinates each from the flat array
// `pos`: row i of `out` receives pos[m*i], pos[m*i + 1], pos[m*i + 2].
// Only m == 3 compiles (enforced by the static_assert).
template<typename T, int n, int m>
__attribute__((always_inline))
static void get_pos_affine(
    T (&__restrict__ out)[n][m],
    const float *__restrict__ pos) {

    static_assert(m == 3, "Only Vector3 is supported");

    // Copy the 3D points stored back to back at the start of `pos`.
    #pragma clang loop unroll(full)
    for (int point = 0; point < n; ++point) {
        const float *src = pos + m * point;
        out[point][0] = src[0];
        out[point][1] = src[1];
        out[point][2] = src[2];
    }
}

// Writes the cross product v1 x v2 of two 3-vectors into `out`.
// `out` may be the same array as v1 or v2: all three components are computed
// before any of them is stored.
template<typename T>
__attribute__((always_inline))
static void cross(T (&out)[3], const T v1[3], const T v2[3]) {
    // Compute into locals first; writing out[0] directly would corrupt the
    // remaining components when `out` overlaps an input.
    const T c0 = v1[1] * v2[2] - v1[2] * v2[1];
    const T c1 = v1[2] * v2[0] - v1[0] * v2[2];
    const T c2 = v1[0] * v2[1] - v1[1] * v2[0];
    out[0] = c0;
    out[1] = c1;
    out[2] = c2;
}


template<typename T>
__attribute__((always_inline))
static T area(const T *__restrict__ u, const T *__restrict__ v, const T *__restrict__ w) {
    T diff1[3];
    elementwise_difference(diff1, v, u);
    T diff2[3];
    elementwise_difference(diff2, w, u);
    T cross_product[3];
    cross(cross_product, diff1, diff2);
    return 0.5 * norm<T, 3>(cross_product);
}   

// Scalar function that grad_f differentiates.
// Returns the sum over i in [0, N) of (sqrt(d_i^2) - 1)^2, where
// d_i = input[i+1] - input[i].
// Reads input[0] .. input[N], i.e. N + 1 elements. The running total is kept
// in a double regardless of T and converted to T on return.
template<typename T>
__attribute__((always_inline))
static T f(size_t N, T* input) {
    double out = 0;
    // __builtin_assume(!((N-1) == 0));
    for (size_t i=0; i<N; i++) {
        //double sub = input[i] - input[i+1]; 
        // out += sub * sub;
        // `sub` is the squared difference of neighbouring elements, so
        // sqrt(sub) below is its magnitude.
        T sub = (input[i+1] - input[i]) * (input[i+1] - input[i]);
        out += (sqrt(sub) - 1)*(sqrt(sub) - 1);
    }
    return out;
}

// Calls __enzyme_autodiff on f<T>, passing N tagged enzyme_const and the
// pair (input, dinput) tagged enzyme_dup.
// NOTE(review): presumably this makes Enzyme accumulate the gradient of f
// with respect to `input` into `dinput`; confirm against the Enzyme docs.
template<typename T>
__attribute__((always_inline))
static void grad_f(size_t N, T* input, T* dinput) {
    __enzyme_autodiff<void>((void*)f<T>, enzyme_const, N, enzyme_dup, input, dinput);
}

// Load handler for an identity-like vector that wraps around at N.
// After dividing idx by sizeof(double), returns 1.0 when the element index
// equals i, or when the element index equals N and i is 0; otherwise 0.0.
// T is not used: the element size and return type are always double.
// NOTE(review): idx is presumably a byte offset; confirm against Enzyme.
template<typename T>
__attribute__((always_inline))
double ringident_load(int64_t idx, size_t i, size_t N) {
    idx /= sizeof(double);
    // return (double)( ( (idx == N) ? 0 : idx) == i);
    return (double)((idx != N && idx == i) || (idx == N && 0 == i));
    // return (double)( idx % N == i);
}
// Store handler for read-only data: any call trips the assertion (when
// assertions are enabled). All arguments are ignored.
template<typename T>
__attribute__((always_inline))
void never_store(T val, int64_t idx, T* input, size_t N) {
    assert(0 && "this is a read only input, why are you storing here...");
}

// Load handler that reads `input` with wrap-around: after dividing idx by
// sizeof(double), returns input[element_index % N].
// T is not used: the element type is always double.
// NOTE(review): idx is presumably a byte offset; confirm against Enzyme.
template<typename T>
__attribute__((always_inline))
double mod_load(int64_t idx, double* input, size_t N) {
    idx /= sizeof(double);
    return input[idx % N];
}

// Builds a list of (row, col, value) triplets by forward-differentiating
// grad_f once per index i in [0, N).
// NOTE(review): the result is presumably the sparse Hessian of f; that
// depends on Enzyme semantics not visible in this file.
//
// Preconditions stated to the optimizer via __builtin_assume: N > 0, N != 1,
// and every loop index is below 100000000. Violating them is undefined.
template<typename T>
__attribute__((noinline))
std::vector<Triple<T>> hess_f(size_t N, T* input) {
    std::vector<Triple<T>> triplets;
    // Replace `input` with the pointer returned by __enzyme_todense, built
    // from mod_load / never_store and the extra arguments (input, N).
    input = __enzyme_todense<T*>((void*)mod_load<T>, (void*)never_store<T>, input, N);
    __builtin_assume(N > 0);
    __builtin_assume(N != 1);
    for (size_t i=0; i<N; i++) {
        __builtin_assume(i < 100000000);
        // Seed direction: built from ringident_load with extra arguments (i, N).
        T* d_input = __enzyme_todense<T*>((void*)ringident_load<T>, (void*)never_store<T>, i, N);
        // Output sink: built from sparse_load / sparse_store with extra
        // arguments (i, &triplets), so stores append to `triplets`.
        T* d_dinput = __enzyme_todense<T*>((void*)sparse_load<T>, (void*)sparse_store<T>, i, &triplets);

       // The primal of the last pair is the placeholder (T*)0x1, tagged
       // enzyme_dupnoneed; presumably it is never dereferenced (TODO confirm).
       __enzyme_fwddiff<void>((void*)grad_f<T>, 
                            enzyme_const, N,
                            enzyme_dup, input, d_input,
                            enzyme_dupnoneed, (T*)0x1, d_dinput);

    }
    return triplets;
}


// Driver: builds the input x[i] = (i + 1)^2 for i in [0, N), times one call
// to hess_f, and prints the number of triplets and the elapsed time. For
// N <= 30 it also prints every triplet.
// N defaults to 8 and can be overridden by the first command-line argument.
// Returns 0 on success, 1 when N is unusable or allocation fails.
int main(int argc, char** argv) {
    size_t N = 8;

    if (argc >= 2) {
        N = atoi(argv[1]);
    }

    // hess_f tells the optimizer that N > 0 and N != 1; calling it with a
    // smaller N would be undefined behavior, so reject it here.
    if (N < 2) {
        fprintf(stderr, "N must be at least 2\n");
        return 1;
    }

    double *x = (double*)malloc(sizeof(double) * N);
    if (x == NULL) {
        fprintf(stderr, "failed to allocate input of %zu elements\n", N);
        return 1;
    }
    // size_t index to match N (an int index mixes signedness in the compare).
    for (size_t i = 0; i < N; i++) x[i] = (i + 1) * (i + 1);

    struct timeval start, end;
    gettimeofday(&start, NULL);

    auto res = hess_f(N, x);

    gettimeofday(&end, NULL);

    // size_t values are printed with %zu; %ld is the wrong conversion.
    printf("Number of elements %zu\n", res.size());

    printf("Runtime %0.6f\n", tdiff(&start, &end));

    if (N <= 30) {
        for (const auto &tup : res)
            printf("%zu, %zu = %f\n", tup.row, tup.col, tup.val);
    }

    // Release the input buffer allocated above.
    free(x);
    return 0;
}
```
</details>

<details>

<summary>Error message details </summary>

clang++: /app/Enzyme/enzyme/Enzyme/FunctionUtils.cpp:8041: void replaceToDense(llvm::CallBase*, bool, llvm::Function*, const llvm::DataLayout&): Assertion `SI->getValueOperand() != val' failed.
PLEASE submit a bug report to https://github.com/llvm/llvm-project/issues/ and include the crash backtrace, preprocessed source, and associated run script.
Stack dump:
0.	Program arguments: /opt/compiler-explorer/clang-16.0.0/bin/clang++ -gdwarf-4 -g -o /tmp/compiler-explorer-compiler2025528-98-1f78sgi.0wxqk/output.s -fno-verbose-asm -c -fpass-plugin=/opt/compiler-explorer/main/ClangEnzyme-16.so -Xclang -load -Xclang /opt/compiler-explorer/main/ClangEnzyme-16.so -fcolor-diagnostics -fno-crash-diagnostics -O0 -fno-vectorize -ffast-math -fno-unroll-loops -mllvm -enable-load-pre=0 -mllvm -enzyme-auto-sparsity=1 <source>
1.	<eof> parser at end of file
2.	Optimizer
 #0 0x0000555b5e20a8df llvm::sys::PrintStackTrace(llvm::raw_ostream&, int) (/opt/compiler-explorer/clang-16.0.0/bin/clang+++0x3ba18df)
 #1 0x0000555b5e20890c llvm::sys::CleanupOnSignal(unsigned long) (/opt/compiler-explorer/clang-16.0.0/bin/clang+++0x3b9f90c)
 #2 0x0000555b5e15b518 CrashRecoverySignalHandler(int) CrashRecoveryContext.cpp:0:0
 #3 0x00007f2513021520 (/lib/x86_64-linux-gnu/libc.so.6+0x42520)
 #4 0x00007f25130759fc pthread_kill (/lib/x86_64-linux-gnu/libc.so.6+0x969fc)
 #5 0x00007f2513021476 gsignal (/lib/x86_64-linux-gnu/libc.so.6+0x42476)
 #6 0x00007f25130077f3 abort (/lib/x86_64-linux-gnu/libc.so.6+0x287f3)
 #7 0x00007f251300771b (/lib/x86_64-linux-gnu/libc.so.6+0x2871b)
 #8 0x00007f2513018e96 (/lib/x86_64-linux-gnu/libc.so.6+0x39e96)
 #9 0x00007f2512a3265b replaceToDense(llvm::CallBase*, bool, llvm::Function*, llvm::DataLayout const&) /app/Enzyme/enzyme/Enzyme/FunctionUtils.cpp:8042:62
#10 0x00007f2512a3330b LowerSparsification(llvm::Function*, bool) /app/Enzyme/enzyme/Enzyme/FunctionUtils.cpp:8128:3
#11 0x00007f25128e65e8 (anonymous namespace)::EnzymeBase::lowerEnzymeCalls(llvm::Function&, std::set<llvm::Function*, std::less<llvm::Function*>, std::allocator<llvm::Function*>>&) /app/Enzyme/enzyme/Enzyme/Enzyme.cpp:2926:47
#12 0x00007f25128e73b5 (anonymous namespace)::EnzymeBase::run(llvm::Module&) /app/Enzyme/enzyme/Enzyme/Enzyme.cpp:3057:15
#13 0x00007f25128efd92 EnzymeNewPM::run(llvm::Module&, llvm::AnalysisManager<llvm::Module>&) /app/Enzyme/enzyme/Enzyme/Enzyme.cpp:3318:56
#14 0x00007f2512932d4f llvm::detail::PassModel<llvm::Module, EnzymeNewPM, llvm::PreservedAnalyses, llvm::AnalysisManager<llvm::Module>>::run(llvm::Module&, llvm::AnalysisManager<llvm::Module>&) /opt/compiler-explorer/clang-16.0.0/include/llvm/IR/PassManagerInternal.h:90:3
#15 0x0000555b5dabbfb9 llvm::PassManager<llvm::Module, llvm::AnalysisManager<llvm::Module>>::run(llvm::Module&, llvm::AnalysisManager<llvm::Module>&) (/opt/compiler-explorer/clang-16.0.0/bin/clang+++0x3452fb9)
#16 0x0000555b5e5edb44 (anonymous namespace)::EmitAssemblyHelper::RunOptimizationPipeline(clang::BackendAction, std::unique_ptr<llvm::raw_pwrite_stream, std::default_delete<llvm::raw_pwrite_stream>>&, std::unique_ptr<llvm::ToolOutputFile, std::default_delete<llvm::ToolOutputFile>>&) BackendUtil.cpp:0:0
#17 0x0000555b5e5f0bd6 clang::EmitBackendOutput(clang::DiagnosticsEngine&, clang::HeaderSearchOptions const&, clang::CodeGenOptions const&, clang::TargetOptions const&, clang::LangOptions const&, llvm::StringRef, llvm::Module*, clang::BackendAction, std::unique_ptr<llvm::raw_pwrite_stream, std::default_delete<llvm::raw_pwrite_stream>>) (/opt/compiler-explorer/clang-16.0.0/bin/clang+++0x3f87bd6)
#18 0x0000555b5f4fb238 clang::BackendConsumer::HandleTranslationUnit(clang::ASTContext&) (/opt/compiler-explorer/clang-16.0.0/bin/clang+++0x4e92238)
#19 0x0000555b5ee10b50 clang::MultiplexConsumer::HandleTranslationUnit(clang::ASTContext&) (/opt/compiler-explorer/clang-16.0.0/bin/clang+++0x47a7b50)
#20 0x0000555b6071fd5d clang::ParseAST(clang::Sema&, bool, bool) (/opt/compiler-explorer/clang-16.0.0/bin/clang+++0x60b6d5d)
#21 0x0000555b5f4fa865 clang::CodeGenAction::ExecuteAction() (/opt/compiler-explorer/clang-16.0.0/bin/clang+++0x4e91865)
#22 0x0000555b5edd6671 clang::FrontendAction::Execute() (/opt/compiler-explorer/clang-16.0.0/bin/clang+++0x476d671)
#23 0x0000555b5ed5a993 clang::CompilerInstance::ExecuteAction(clang::FrontendAction&) (/opt/compiler-explorer/clang-16.0.0/bin/clang+++0x46f1993)
#24 0x0000555b5eebab3b clang::ExecuteCompilerInvocation(clang::CompilerInstance*) (/opt/compiler-explorer/clang-16.0.0/bin/clang+++0x4851b3b)
#25 0x0000555b5b9e341c cc1_main(llvm::ArrayRef<char const*>, char const*, void*) (/opt/compiler-explorer/clang-16.0.0/bin/clang+++0x137a41c)
#26 0x0000555b5b9debdc ExecuteCC1Tool(llvm::SmallVectorImpl<char const*>&) driver.cpp:0:0
#27 0x0000555b5ebbbce9 void llvm::function_ref<void ()>::callback_fn<clang::driver::CC1Command::Execute(llvm::ArrayRef<std::optional<llvm::StringRef>>, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char>>*, bool*) const::'lambda'()>(long) Job.cpp:0:0
#28 0x0000555b5e15b977 llvm::CrashRecoveryContext::RunSafely(llvm::function_ref<void ()>) (/opt/compiler-explorer/clang-16.0.0/bin/clang+++0x3af2977)
#29 0x0000555b5ebbbf1c clang::driver::CC1Command::Execute(llvm::ArrayRef<std::optional<llvm::StringRef>>, std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char>>*, bool*) const (.part.0) Job.cpp:0:0
#30 0x0000555b5eb843fc clang::driver::Compilation::ExecuteCommand(clang::driver::Command const&, clang::driver::Command const*&, bool) const (/opt/compiler-explorer/clang-16.0.0/bin/clang+++0x451b3fc)
#31 0x0000555b5eb84ded clang::driver::Compilation::ExecuteJobs(clang::driver::JobList const&, llvm::SmallVectorImpl<std::pair<int, clang::driver::Command const*>>&, bool) const (/opt/compiler-explorer/clang-16.0.0/bin/clang+++0x451bded)
#32 0x0000555b5eb8f23c clang::driver::Driver::ExecuteCompilation(clang::driver::Compilation&, llvm::SmallVectorImpl<std::pair<int, clang::driver::Command const*>>&) (/opt/compiler-explorer/clang-16.0.0/bin/clang+++0x452623c)
#33 0x0000555b5b9e1523 clang_main(int, char**) (/opt/compiler-explorer/clang-16.0.0/bin/clang+++0x1378523)
#34 0x00007f2513008d90 (/lib/x86_64-linux-gnu/libc.so.6+0x29d90)
#35 0x00007f2513008e40 __libc_start_main (/lib/x86_64-linux-gnu/libc.so.6+0x29e40)
#36 0x0000555b5b9dad1e _start (/opt/compiler-explorer/clang-16.0.0/bin/clang+++0x1371d1e)
clang-16: error: clang frontend command failed with exit code 134 (use -v to see invocation)
Compiler returned: 134

</details>


CC @pelesh @wsmoses 

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Unable to build with `-O0 -mllvm -enzyme-auto-sparsity=1` #2366

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Unable to build with -O0 -mllvm -enzyme-auto-sparsity=1 #2366

Description

Metadata

Metadata

Assignees

Labels

Type

Projects

Milestone

Relationships

Development

Issue actions

Unable to build with `-O0 -mllvm -enzyme-auto-sparsity=1` #2366