Commit 6f25ed2

New recipe: High Performance Tensor Transposition library (v1.0.5) (#4765)
1 parent 9f789e6 commit 6f25ed2

2 files changed: +92 -0 lines

H/HPTT/build_tarballs.jl

Lines changed: 52 additions & 0 deletions

using BinaryBuilder, Pkg

name = "HPTT"
version = v"1.0.5"

# Collection of sources required to complete build
sources = [
    GitSource("https://github.com/springer13/hptt.git", "a55c2a927d5462e81abeb12081fd345024caf5f6"),
    DirectorySource("./bundled"),
]

# Bash recipe for building across all platforms
script = raw"""
atomic_patch -p1 $WORKSPACE/srcdir/patches/clang_compatibility.patch
mkdir -p ${libdir}
mkdir -p ${includedir}
export hpttdir=${WORKSPACE}/srcdir/hptt
cp ${hpttdir}/include/* ${includedir}
export CXXFLAGS="-O3 -std=c++11 -DNDEBUG -fopenmp -fPIC"
if [[ ${proc_family} == intel ]]; then
    export CXXFLAGS="$CXXFLAGS -mavx -DHPTT_ARCH_AVX"
elif [[ ${proc_family} == power ]]; then
    export CXXFLAGS="$CXXFLAGS -DHPTT_ARCH_IBM -maltivec -mabi=altivec";
## specific arm optimizations seem to be broken in library
# elif [[ ${target} == arm* ]]; then
#     export CXXFLAGS="$CXXFLAGS -mfpu=neon -DHPTT_ARCH_ARM"
fi
for f in ${hpttdir}/src/*.cpp; do
    $CXX $CXXFLAGS -I ${includedir} -c $f -o ${f%.cpp}.o
done
$CXX ${hpttdir}/src/*.o $CXXFLAGS -o ${libdir}/libhptt.$dlext -shared
install_license ${hpttdir}/LICENSE.txt
"""

# These are the platforms we will build for by default, unless further
# platforms are passed in on the command line
platforms = supported_platforms()
platforms = expand_cxxstring_abis(platforms)

# The products that we will ensure are always built
products = [
    LibraryProduct("libhptt", :libhptt),
]

# Dependencies that must be installed before this package can be built
dependencies = [
    Dependency(PackageSpec(name="CompilerSupportLibraries_jll", uuid="e66e0078-7015-5450-92f7-15fbd957f2ae"); platforms=filter(!Sys.isbsd, platforms))
    Dependency(PackageSpec(name="LLVMOpenMP_jll", uuid="1d63c593-3942-5779-bab2-d838dc0a180e"); platforms=filter(Sys.isbsd, platforms))
]

# Build the tarballs, and possibly a `build.jl` as well.
build_tarballs(ARGS, name, version, sources, script, platforms, products, dependencies; preferred_gcc_version = v"5.2.0", julia_compat="1.6")
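
As a point of reference for how the built product gets consumed, below is a minimal usage sketch from the Julia side. It assumes a JLL package named HPTT_jll is eventually generated from this recipe and that the dTensorTranspose entry point matches upstream HPTT's documented C interface (B = alpha * permute(A) + beta * B, with 0-based permutation indices); neither is defined by this commit, only the libhptt product name above is.

# Hedged illustration only: HPTT_jll and the dTensorTranspose argument list are
# assumptions taken from upstream HPTT's README, not from this recipe.
using HPTT_jll   # exports `libhptt`, the LibraryProduct declared above

function hptt_transpose(A::Matrix{Float64}; nthreads::Integer = 1)
    m, n  = size(A)
    B     = Matrix{Float64}(undef, n, m)
    perm  = Cint[1, 0]          # swap the two dimensions (0-based indices)
    sizeA = Cint[m, n]
    sizeB = Cint[n, m]
    ccall((:dTensorTranspose, libhptt), Cvoid,
          (Ptr{Cint}, Cint,
           Cdouble, Ptr{Cdouble}, Ptr{Cint}, Ptr{Cint},
           Cdouble, Ptr{Cdouble}, Ptr{Cint},
           Cint, Cint),
          perm, 2,
          1.0, A, sizeA, sizeA,   # alpha, A, sizeA, outerSizeA (no padding)
          0.0, B, sizeB,          # beta,  B, outerSizeB (no padding)
          nthreads, 0)            # numThreads, useRowMajor (0 = column-major)
    return B
end

The recipe itself is exercised the same way as other Yggdrasil recipes, by passing one or more target triplets on the command line, e.g. julia build_tarballs.jl --verbose x86_64-linux-gnu-cxx11.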

H/HPTT/bundled/patches/clang_compatibility.patch

Lines changed: 40 additions & 0 deletions

diff --git before/hptt/include/hptt_types.h after/hptt/include/hptt_types.h
index 170288e..ebc5796 100644
--- before/hptt/include/hptt_types.h
+++ after/hptt/include/hptt_types.h
@@ -1,7 +1,6 @@
 #pragma once

 #include <complex>
-#include <complex.h>

 #define REGISTER_BITS 256 // AVX
 #ifdef HPTT_ARCH_ARM
diff --git before/hptt/src/hptt.cpp after/hptt/src/hptt.cpp
index 82d4e73..3018664 100644
--- before/hptt/src/hptt.cpp
+++ after/hptt/src/hptt.cpp
@@ -180,8 +180,10 @@ void cTensorTranspose( const int *perm, const int dim,
                const float _Complex beta, float _Complex *B, const int *outerSizeB,
                const int numThreads, const int useRowMajor)
 {
+   const hptt::FloatComplex* calpha = reinterpret_cast<const hptt::FloatComplex*>(&alpha);
+   const hptt::FloatComplex* cbeta = reinterpret_cast<const hptt::FloatComplex*>(&beta);
    auto plan(std::make_shared<hptt::Transpose<hptt::FloatComplex> >(sizeA, perm, outerSizeA, outerSizeB, dim,
-         (const hptt::FloatComplex*) A, (hptt::FloatComplex) alpha, (hptt::FloatComplex*) B, (hptt::FloatComplex) beta, hptt::ESTIMATE, numThreads, nullptr, useRowMajor));
+         (const hptt::FloatComplex*) A, *calpha, (hptt::FloatComplex*) B, *cbeta, hptt::ESTIMATE, numThreads, nullptr, useRowMajor));
    plan->setConjA(conjA);
    plan->execute();
 }
@@ -191,8 +193,10 @@ void zTensorTranspose( const int *perm, const int dim,
                const double _Complex beta, double _Complex *B, const int *outerSizeB,
                const int numThreads, const int useRowMajor)
 {
+   const hptt::DoubleComplex* calpha = reinterpret_cast<const hptt::DoubleComplex*>(&alpha);
+   const hptt::DoubleComplex* cbeta = reinterpret_cast<const hptt::DoubleComplex*>(&beta);
    auto plan(std::make_shared<hptt::Transpose<hptt::DoubleComplex> >(sizeA, perm, outerSizeA, outerSizeB, dim,
-         (const hptt::DoubleComplex*) A, (hptt::DoubleComplex) alpha, (hptt::DoubleComplex*) B, (hptt::DoubleComplex) beta, hptt::ESTIMATE, numThreads, nullptr, useRowMajor));
+         (const hptt::DoubleComplex*) A, *calpha, (hptt::DoubleComplex*) B, *cbeta, hptt::ESTIMATE, numThreads, nullptr, useRowMajor));
    plan->setConjA(conjA);
    plan->execute();
 }
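
For context on what the patch does: the upstream complex-valued C wrappers take C99 _Complex scalars and hand them to the std::complex-based hptt::Transpose plan through a C-style value cast. The clang-based toolchains that BinaryBuilder uses for the macOS and FreeBSD targets evidently do not accept that cast (hence the patch name), so the patch reinterprets each scalar's storage through a pointer instead and removes the C <complex.h> include from hptt_types.h. The reinterpret_cast leans on _Complex and std::complex sharing the same storage layout, two contiguous floating-point values (re, im), which is also the layout Julia's ComplexF32/ComplexF64 use when passed through ccall. A quick Julia check of that layout assumption (illustration only, not part of the recipe):

# A complex scalar is stored as (re, im) with no extra padding; this is the
# compatibility that the patch's reinterpret_cast relies on.
z = 1.0 + 2.0im                                      # ComplexF64
@assert sizeof(ComplexF64) == 2 * sizeof(Float64)
@assert reinterpret(Float64, [z]) == [real(z), imag(z)]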
