Commit 6f25ed2

New recipe: High Performance Tensor Transposition library (v1.0.5) (#4765)
1 parent 9f789e6 commit 6f25ed2

2 files changed: +92 -0 lines

H/HPTT/build_tarballs.jl

Lines changed: 52 additions & 0 deletions

using BinaryBuilder, Pkg

name = "HPTT"
version = v"1.0.5"

# Collection of sources required to complete build
sources = [
    GitSource("https://github.com/springer13/hptt.git", "a55c2a927d5462e81abeb12081fd345024caf5f6"),
    DirectorySource("./bundled"),
]

# Bash recipe for building across all platforms
script = raw"""
atomic_patch -p1 $WORKSPACE/srcdir/patches/clang_compatibility.patch
mkdir -p ${libdir}
mkdir -p ${includedir}
export hpttdir=${WORKSPACE}/srcdir/hptt
cp ${hpttdir}/include/* ${includedir}
export CXXFLAGS="-O3 -std=c++11 -DNDEBUG -fopenmp -fPIC"
if [[ ${proc_family} == intel ]]; then
    export CXXFLAGS="$CXXFLAGS -mavx -DHPTT_ARCH_AVX"
elif [[ ${proc_family} == power ]]; then
    export CXXFLAGS="$CXXFLAGS -DHPTT_ARCH_IBM -maltivec -mabi=altivec";
## specific arm optimizations seem to be broken in library
# elif [[ ${target} == arm* ]]; then
#     export CXXFLAGS="$CXXFLAGS -mfpu=neon -DHPTT_ARCH_ARM"
fi
for f in ${hpttdir}/src/*.cpp; do
    $CXX $CXXFLAGS -I ${includedir} -c $f -o ${f%.cpp}.o
done
$CXX ${hpttdir}/src/*.o $CXXFLAGS -o ${libdir}/libhptt.$dlext -shared
install_license ${hpttdir}/LICENSE.txt
"""

# These are the platforms we will build for by default, unless further
# platforms are passed in on the command line
platforms = supported_platforms()
platforms = expand_cxxstring_abis(platforms)

# The products that we will ensure are always built
products = [
    LibraryProduct("libhptt", :libhptt),
]

# Dependencies that must be installed before this package can be built
dependencies = [
    Dependency(PackageSpec(name="CompilerSupportLibraries_jll", uuid="e66e0078-7015-5450-92f7-15fbd957f2ae"); platforms=filter(!Sys.isbsd, platforms))
    Dependency(PackageSpec(name="LLVMOpenMP_jll", uuid="1d63c593-3942-5779-bab2-d838dc0a180e"); platforms=filter(Sys.isbsd, platforms))
]

# Build the tarballs, and possibly a `build.jl` as well.
build_tarballs(ARGS, name, version, sources, script, platforms, products, dependencies; preferred_gcc_version = v"5.2.0", julia_compat="1.6")
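
As a point of reference for how the built product gets consumed, below is a minimal usage sketch from the Julia side. It assumes a JLL package named HPTT_jll is eventually generated from this recipe and that the dTensorTranspose entry point matches upstream HPTT's documented C interface (B = alpha * permute(A) + beta * B, with 0-based permutation indices); neither is defined by this commit, only the libhptt product name above is.

# Hedged illustration only: HPTT_jll and the dTensorTranspose argument list are
# assumptions taken from upstream HPTT's README, not from this recipe.
using HPTT_jll   # exports `libhptt`, the LibraryProduct declared above

function hptt_transpose(A::Matrix{Float64}; nthreads::Integer = 1)
    m, n  = size(A)
    B     = Matrix{Float64}(undef, n, m)
    perm  = Cint[1, 0]          # swap the two dimensions (0-based indices)
    sizeA = Cint[m, n]
    sizeB = Cint[n, m]
    ccall((:dTensorTranspose, libhptt), Cvoid,
          (Ptr{Cint}, Cint,
           Cdouble, Ptr{Cdouble}, Ptr{Cint}, Ptr{Cint},
           Cdouble, Ptr{Cdouble}, Ptr{Cint},
           Cint, Cint),
          perm, 2,
          1.0, A, sizeA, sizeA,   # alpha, A, sizeA, outerSizeA (no padding)
          0.0, B, sizeB,          # beta,  B, outerSizeB (no padding)
          nthreads, 0)            # numThreads, useRowMajor (0 = column-major)
    return B
end

The recipe itself is exercised the same way as other Yggdrasil recipes, by passing one or more target triplets on the command line, e.g. julia build_tarballs.jl --verbose x86_64-linux-gnu-cxx11.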

H/HPTT/bundled/patches/clang_compatibility.patch

Lines changed: 40 additions & 0 deletions

diff --git before/hptt/include/hptt_types.h after/hptt/include/hptt_types.h
index 170288e..ebc5796 100644
--- before/hptt/include/hptt_types.h
+++ after/hptt/include/hptt_types.h
@@ -1,7 +1,6 @@
 #pragma once

 #include <complex>
-#include <complex.h>

 #define REGISTER_BITS 256 // AVX
 #ifdef HPTT_ARCH_ARM
diff --git before/hptt/src/hptt.cpp after/hptt/src/hptt.cpp
index 82d4e73..3018664 100644
--- before/hptt/src/hptt.cpp
+++ after/hptt/src/hptt.cpp
@@ -180,8 +180,10 @@ void cTensorTranspose( const int *perm, const int dim,
                const float _Complex beta, float _Complex *B, const int *outerSizeB,
                const int numThreads, const int useRowMajor)
 {
+   const hptt::FloatComplex* calpha = reinterpret_cast<const hptt::FloatComplex*>(&alpha);
+   const hptt::FloatComplex* cbeta = reinterpret_cast<const hptt::FloatComplex*>(&beta);
    auto plan(std::make_shared<hptt::Transpose<hptt::FloatComplex> >(sizeA, perm, outerSizeA, outerSizeB, dim,
-         (const hptt::FloatComplex*) A, (hptt::FloatComplex) alpha, (hptt::FloatComplex*) B, (hptt::FloatComplex) beta, hptt::ESTIMATE, numThreads, nullptr, useRowMajor));
+         (const hptt::FloatComplex*) A, *calpha, (hptt::FloatComplex*) B, *cbeta, hptt::ESTIMATE, numThreads, nullptr, useRowMajor));
    plan->setConjA(conjA);
    plan->execute();
 }
@@ -191,8 +193,10 @@ void zTensorTranspose( const int *perm, const int dim,
                const double _Complex beta, double _Complex *B, const int *outerSizeB,
                const int numThreads, const int useRowMajor)
 {
+   const hptt::DoubleComplex* calpha = reinterpret_cast<const hptt::DoubleComplex*>(&alpha);
+   const hptt::DoubleComplex* cbeta = reinterpret_cast<const hptt::DoubleComplex*>(&beta);
    auto plan(std::make_shared<hptt::Transpose<hptt::DoubleComplex> >(sizeA, perm, outerSizeA, outerSizeB, dim,
-         (const hptt::DoubleComplex*) A, (hptt::DoubleComplex) alpha, (hptt::DoubleComplex*) B, (hptt::DoubleComplex) beta, hptt::ESTIMATE, numThreads, nullptr, useRowMajor));
+         (const hptt::DoubleComplex*) A, *calpha, (hptt::DoubleComplex*) B, *cbeta, hptt::ESTIMATE, numThreads, nullptr, useRowMajor));
    plan->setConjA(conjA);
    plan->execute();
 }
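
For context on what the patch does: the upstream complex-valued C wrappers take C99 _Complex scalars and hand them to the std::complex-based hptt::Transpose plan through a C-style value cast. The clang-based toolchains that BinaryBuilder uses for the macOS and FreeBSD targets evidently do not accept that cast (hence the patch name), so the patch reinterprets each scalar's storage through a pointer instead and removes the C <complex.h> include from hptt_types.h. The reinterpret_cast leans on _Complex and std::complex sharing the same storage layout, two contiguous floating-point values (re, im), which is also the layout Julia's ComplexF32/ComplexF64 use when passed through ccall. A quick Julia check of that layout assumption (illustration only, not part of the recipe):

# A complex scalar is stored as (re, im) with no extra padding; this is the
# compatibility that the patch's reinterpret_cast relies on.
z = 1.0 + 2.0im                                      # ComplexF64
@assert sizeof(ComplexF64) == 2 * sizeof(Float64)
@assert reinterpret(Float64, [z]) == [real(z), imag(z)]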
