Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 36 additions & 0 deletions c/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@ add_subdirectory(dependencies)
set(BLAKE3_AMD64_NAMES amd64 AMD64 x86_64)
set(BLAKE3_X86_NAMES i686 x86 X86)
set(BLAKE3_ARMv8_NAMES aarch64 AArch64 arm64 ARM64 armv8 armv8a)
set(BLAKE3_LOONG_NAMES loongarch64)
# default SIMD compiler flag configuration (can be overridden by toolchains or CLI)
if(MSVC)
set(BLAKE3_CFLAGS_SSE2 "/arch:SSE2" CACHE STRING "the compiler flags to enable SSE2")
Expand Down Expand Up @@ -76,6 +77,18 @@ elseif(CMAKE_C_COMPILER_ID STREQUAL "GNU"
# 32-bit ARMv8 needs NEON to be enabled explicitly
set(BLAKE3_CFLAGS_NEON "-mfpu=neon" CACHE STRING "the compiler flags to enable NEON")
endif()

if(CMAKE_SYSTEM_PROCESSOR IN_LIST BLAKE3_LOONG_NAMES)
  # LoongArch: only define BLAKE3_CFLAGS_LASX / BLAKE3_CFLAGS_LSX when the
  # compiler actually accepts the corresponding option. The rest of this file
  # keys off whether these variables are DEFINED, so leaving them unset is how
  # "not supported" is communicated.
  #
  # check_c_compiler_flag() stores its result in the (global) cache, so the
  # result variables carry the BLAKE3_ prefix to avoid clashing with entries
  # of the same name created by a parent project.
  include(CheckCCompilerFlag)

  check_c_compiler_flag(-mlasx BLAKE3_COMPILER_SUPPORTS_LASX)
  if(BLAKE3_COMPILER_SUPPORTS_LASX)
    set(BLAKE3_CFLAGS_LASX "-mlasx" CACHE STRING "the compiler flags to enable lasx")
  endif()

  check_c_compiler_flag(-mlsx BLAKE3_COMPILER_SUPPORTS_LSX)
  if(BLAKE3_COMPILER_SUPPORTS_LSX)
    set(BLAKE3_CFLAGS_LSX "-mlsx" CACHE STRING "the compiler flags to enable lsx")
  endif()
endif()
endif()

mark_as_advanced(BLAKE3_CFLAGS_SSE2 BLAKE3_CFLAGS_SSE4.1 BLAKE3_CFLAGS_AVX2 BLAKE3_CFLAGS_AVX512 BLAKE3_CFLAGS_NEON)
Expand Down Expand Up @@ -113,6 +126,10 @@ elseif((CMAKE_SYSTEM_PROCESSOR IN_LIST BLAKE3_ARMv8_NAMES
OR CMAKE_SIZEOF_VOID_P EQUAL 8))
set(BLAKE3_SIMD_TYPE "neon-intrinsics" CACHE STRING "the SIMD acceleration type to use")

elseif(CMAKE_SYSTEM_PROCESSOR IN_LIST BLAKE3_LOONG_NAMES
AND (DEFINED BLAKE3_CFLAGS_LSX OR DEFINED BLAKE3_CFLAGS_LASX))
set(BLAKE3_SIMD_TYPE "loong-intrinsics" CACHE STRING "the SIMD acceleration type to use")

else()
set(BLAKE3_SIMD_TYPE "none" CACHE STRING "the SIMD acceleration type to use")
endif()
Expand Down Expand Up @@ -205,13 +222,31 @@ elseif(BLAKE3_SIMD_TYPE STREQUAL "neon-intrinsics")
set_source_files_properties(blake3_neon.c PROPERTIES COMPILE_FLAGS "${BLAKE3_CFLAGS_NEON}")
endif()

elseif(BLAKE3_SIMD_TYPE STREQUAL "loong-intrinsics")
# Flag read by add_feature_info() for the "LoongArch SIMD intrinsics" entry.
set(BLAKE3_SIMD_LOONG_INTRINSICS ON)

# LSX: without a flag variable the implementation is compiled out via
# BLAKE3_NO_LSX; otherwise its source is added and built with that flag.
if(NOT DEFINED BLAKE3_CFLAGS_LSX)
  target_compile_definitions(blake3 PRIVATE BLAKE3_NO_LSX)
else()
  target_sources(blake3 PRIVATE blake3_lsx.c)
  set_source_files_properties(blake3_lsx.c PROPERTIES COMPILE_FLAGS "${BLAKE3_CFLAGS_LSX}")
endif()

# LASX: same scheme as LSX, with its own source file, flag and opt-out macro.
if(NOT DEFINED BLAKE3_CFLAGS_LASX)
  target_compile_definitions(blake3 PRIVATE BLAKE3_NO_LASX)
else()
  target_sources(blake3 PRIVATE blake3_lasx.c)
  set_source_files_properties(blake3_lasx.c PROPERTIES COMPILE_FLAGS "${BLAKE3_CFLAGS_LASX}")
endif()


elseif(BLAKE3_SIMD_TYPE STREQUAL "none")
target_compile_definitions(blake3 PRIVATE
BLAKE3_USE_NEON=0
BLAKE3_NO_SSE2
BLAKE3_NO_SSE41
BLAKE3_NO_AVX2
BLAKE3_NO_AVX512
BLAKE3_NO_LSX
BLAKE3_NO_LASX
)

else()
Expand Down Expand Up @@ -365,6 +400,7 @@ install(FILES "${CMAKE_BINARY_DIR}/libblake3.pc"
add_feature_info("AMD64 assembly" BLAKE3_SIMD_AMD64_ASM "The library uses hand written amd64 SIMD assembly.")
add_feature_info("x86 SIMD intrinsics" BLAKE3_SIMD_X86_INTRINSICS "The library uses x86 SIMD intrinsics.")
add_feature_info("NEON SIMD intrinsics" BLAKE3_SIMD_NEON_INTRINSICS "The library uses NEON SIMD intrinsics.")
add_feature_info("LoongArch SIMD intrinsics" BLAKE3_SIMD_LOONG_INTRINSICS "The library uses LoongArch LSX/LASX SIMD intrinsics.")
add_feature_info("oneTBB parallelism" BLAKE3_USE_TBB "The library uses oneTBB parallelism.")
feature_summary(WHAT ENABLED_FEATURES)

Expand Down
43 changes: 43 additions & 0 deletions c/blake3_c_rust_bindings/benches/bench.rs
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@ fn bench_single_compression_sse41(b: &mut Bencher) {
}

#[bench]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
fn bench_single_compression_avx512(b: &mut Bencher) {
if !blake3_c_rust_bindings::avx512_detected() {
return;
Expand Down Expand Up @@ -186,6 +187,7 @@ fn bench_many_chunks_avx2(b: &mut Bencher) {
}

#[bench]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
fn bench_many_chunks_avx512(b: &mut Bencher) {
if !blake3_c_rust_bindings::avx512_detected() {
return;
Expand All @@ -208,6 +210,26 @@ fn bench_many_chunks_neon(b: &mut Bencher) {
);
}

// Benchmarks the LASX `hash_many` implementation on whole chunks.
//
// Like the AVX-512 benchmarks, this returns early when the CPU does not
// report the extension, so the benchmark run is not killed by an illegal
// instruction on LoongArch hardware without LASX.
#[bench]
#[cfg(target_arch = "loongarch64")]
fn bench_many_chunks_lasx(b: &mut Bencher) {
    if !blake3_c_rust_bindings::lasx_detected() {
        return;
    }
    bench_many_chunks_fn(
        b,
        blake3_c_rust_bindings::ffi::loong::blake3_hash_many_lasx,
        8,
    );
}

// Benchmarks the LSX `hash_many` implementation on whole chunks.
//
// Like the AVX-512 benchmarks, this returns early when the CPU does not
// report the extension, so the benchmark run is not killed by an illegal
// instruction on LoongArch hardware without LSX.
#[bench]
#[cfg(target_arch = "loongarch64")]
fn bench_many_chunks_lsx(b: &mut Bencher) {
    if !blake3_c_rust_bindings::lsx_detected() {
        return;
    }
    bench_many_chunks_fn(
        b,
        blake3_c_rust_bindings::ffi::loong::blake3_hash_many_lsx,
        4,
    );
}

// TODO: When we get const generics we can unify this with the chunks code.
fn bench_many_parents_fn(b: &mut Bencher, f: HashManyFn, degree: usize) {
let mut inputs = Vec::new();
Expand Down Expand Up @@ -278,6 +300,7 @@ fn bench_many_parents_avx2(b: &mut Bencher) {
}

#[bench]
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
fn bench_many_parents_avx512(b: &mut Bencher) {
if !blake3_c_rust_bindings::avx512_detected() {
return;
Expand All @@ -300,6 +323,26 @@ fn bench_many_parents_neon(b: &mut Bencher) {
);
}

// Benchmarks the LASX `hash_many` implementation on parent nodes, passing 8
// as the `degree` argument of `bench_many_parents_fn`.
//
// Returns early when the CPU does not report LASX (same pattern as the
// AVX-512 benchmarks) instead of faulting on an unsupported instruction.
#[bench]
#[cfg(target_arch = "loongarch64")]
fn bench_many_parents_lasx(b: &mut Bencher) {
    if !blake3_c_rust_bindings::lasx_detected() {
        return;
    }
    bench_many_parents_fn(
        b,
        blake3_c_rust_bindings::ffi::loong::blake3_hash_many_lasx,
        8,
    );
}

// Benchmarks the LSX `hash_many` implementation on parent nodes, passing 4
// as the `degree` argument of `bench_many_parents_fn`.
//
// Returns early when the CPU does not report LSX (same pattern as the
// AVX-512 benchmarks) instead of faulting on an unsupported instruction.
#[bench]
#[cfg(target_arch = "loongarch64")]
fn bench_many_parents_lsx(b: &mut Bencher) {
    if !blake3_c_rust_bindings::lsx_detected() {
        return;
    }
    bench_many_parents_fn(
        b,
        blake3_c_rust_bindings::ffi::loong::blake3_hash_many_lsx,
        4,
    );
}

fn bench_incremental(b: &mut Bencher, len: usize) {
let mut input = RandomInput::new(b, len);
b.iter(|| {
Expand Down
15 changes: 15 additions & 0 deletions c/blake3_c_rust_bindings/build.rs
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,10 @@ fn is_aarch64() -> bool {
target_components()[0] == "aarch64"
}

// True when the first component of the target description is "loongarch64".
fn is_loongarch64() -> bool {
    let components = target_components();
    components[0] == "loongarch64"
}

// Windows targets may be using the MSVC toolchain or the GNU toolchain. The
// right compiler flags to use depend on the toolchain. (And we don't want to
// use flag_if_supported, because we don't want features to be silently
Expand Down Expand Up @@ -223,6 +227,17 @@ fn main() -> Result<(), Box<dyn std::error::Error>> {
neon_build.compile("blake3_neon");
}

if defined("CARGO_FEATURE_LOONGARCH") || is_loongarch64() {
    // One static library per LoongArch SIMD source file, each compiled with
    // the flag that enables its instruction set. Order: LSX first, then LASX.
    let loong_units = [
        ("blake3_lsx.c", "-mlsx", "blake3_lsx"),
        ("blake3_lasx.c", "-mlasx", "blake3_lasx"),
    ];
    for &(source, simd_flag, lib_name) in loong_units.iter() {
        let mut unit_build = new_build();
        unit_build.file(c_dir_path(source));
        unit_build.flag(simd_flag);
        unit_build.compile(lib_name);
    }
}

// The `cc` crate does not automatically emit rerun-if directives for the
// environment variables it supports, in particular for $CC. We expect to
// do a lot of benchmarking across different compilers, so we explicitly
Expand Down
44 changes: 44 additions & 0 deletions c/blake3_c_rust_bindings/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@
use std::ffi::{c_void, CString};
use std::mem::MaybeUninit;

#[cfg(target_arch = "loongarch64")]
use std::arch::is_loongarch_feature_detected;

#[cfg(test)]
mod test;

Expand Down Expand Up @@ -35,6 +38,16 @@ pub fn avx512_detected() -> bool {
is_x86_feature_detected!("avx512f") && is_x86_feature_detected!("avx512vl")
}

/// Reports whether `is_loongarch_feature_detected!` finds the `lsx` feature
/// on the CPU this code is running on. Only compiled for `loongarch64`.
#[cfg(target_arch = "loongarch64")]
pub fn lsx_detected() -> bool {
is_loongarch_feature_detected!("lsx")
}

/// Reports whether `is_loongarch_feature_detected!` finds the `lasx` feature
/// on the CPU this code is running on. Only compiled for `loongarch64`.
#[cfg(target_arch = "loongarch64")]
pub fn lasx_detected() -> bool {
is_loongarch_feature_detected!("lasx")
}

#[derive(Clone)]
pub struct Hasher(ffi::blake3_hasher);

Expand Down Expand Up @@ -330,4 +343,35 @@ pub mod ffi {
);
}
}

// Raw declarations of the LoongArch SIMD `hash_many` entry points. The module
// only exists when compiling for loongarch64. The two functions share one
// parameter list, in the same order that blake3_dispatch.c uses when it calls
// them. NOTE(review): the symbols are presumably provided by blake3_lasx.c
// and blake3_lsx.c, which build.rs compiles -- confirm.
#[cfg(target_arch = "loongarch64")]
pub mod loong {
extern "C" {
// LASX low level function
pub fn blake3_hash_many_lasx(
inputs: *const *const u8,
num_inputs: usize,
blocks: usize,
key: *const u32,
counter: u64,
increment_counter: bool,
flags: u8,
flags_start: u8,
flags_end: u8,
out: *mut u8,
);
// LSX low level function
pub fn blake3_hash_many_lsx(
inputs: *const *const u8,
num_inputs: usize,
blocks: usize,
key: *const u32,
counter: u64,
increment_counter: bool,
flags: u8,
flags_start: u8,
flags_end: u8,
out: *mut u8,
);
}
}
}
12 changes: 12 additions & 0 deletions c/blake3_c_rust_bindings/src/test.rs
Original file line number Diff line number Diff line change
Expand Up @@ -359,6 +359,18 @@ fn test_hash_many_neon() {
test_hash_many_fn(crate::ffi::neon::blake3_hash_many_neon);
}

// Runs the shared `hash_many` test against the LASX implementation.
//
// The test is a no-op when the CPU does not report LASX: calling the LASX
// code on such hardware would abort the test binary with an illegal
// instruction rather than produce a test failure.
#[test]
#[cfg(target_arch = "loongarch64")]
fn test_hash_many_lasx() {
    if !crate::lasx_detected() {
        return;
    }
    test_hash_many_fn(crate::ffi::loong::blake3_hash_many_lasx);
}

// Runs the shared `hash_many` test against the LSX implementation.
//
// The test is a no-op when the CPU does not report LSX: calling the LSX
// code on such hardware would abort the test binary with an illegal
// instruction rather than produce a test failure.
#[test]
#[cfg(target_arch = "loongarch64")]
fn test_hash_many_lsx() {
    if !crate::lsx_detected() {
        return;
    }
    test_hash_many_fn(crate::ffi::loong::blake3_hash_many_lsx);
}

#[allow(unused)]
type XofManyFunction = unsafe extern "C" fn(
cv: *const u32,
Expand Down
77 changes: 77 additions & 0 deletions c/blake3_dispatch.c
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,7 @@

#define MAYBE_UNUSED(x) (void)((x))

#if defined(IS_X86) || defined(IS_LOONGARCH)
#if defined(IS_X86)
static uint64_t xgetbv(void) {
#if defined(_MSC_VER)
Expand Down Expand Up @@ -88,7 +89,42 @@ static void cpuidex(uint32_t out[4], uint32_t id, uint32_t sid) {
: "a"(id), "c"(sid));
#endif
}
#endif

#if defined(IS_LOONGARCH)
#ifdef __linux__
/*
 * Issues a raw prctl system call (number 167) with option PR_GET_AUXV,
 * passing `buf` and `bufsz` as the second and third arguments and zero for
 * the remaining two.
 *
 * Returns whatever the kernel leaves in a0, converted to int. The caller
 * (get_hwcap) treats a negative value as failure and a non-negative value as
 * a byte count.
 */
static inline int prctl_auxv(void *buf, size_t bufsz) {
/* Each value is pinned to a specific register: arguments in a0-a4, the
 * syscall number in a7. */
register uint64_t a0 __asm__("a0") = 0x41555856UL; /* PR_GET_AUXV */
register uint64_t a1 __asm__("a1") = (uint64_t)buf;
register uint64_t a2 __asm__("a2") = (uint64_t)bufsz;
register uint64_t a3 __asm__("a3") = 0;
register uint64_t a4 __asm__("a4") = 0;
register uint64_t a7 __asm__("a7") = 167; /* __NR_prctl */
/* a0 is both input and output ("+r"); t0-t8 and memory are declared as
 * clobbered by the syscall. */
__asm__ __volatile__("syscall 0\n\t"
: "+r"(a0)
: "r"(a7), "r"(a1), "r"(a2), "r"(a3), "r"(a4)
: "memory", "t0", "t1", "t2", "t3", "t4", "t5", "t6", "t7", "t8");
return a0;
}
#endif /* __linux__ */

/*
 * Returns the raw capability word used for LoongArch SIMD detection.
 *
 * On Linux the auxiliary vector is fetched through prctl_auxv() and the
 * AT_HWCAP entry is returned shifted right by 4; 0 is returned if the
 * vector cannot be read or holds no AT_HWCAP entry. Elsewhere CPUCFG word 2
 * is read directly and shifted right by 6. The caller masks the result with
 * (LSX | LASX), so the shifts presumably move the LSX/LASX bits down to
 * bits 0 and 1 -- NOTE(review): confirm against the kernel/ISA bit layout.
 */
static uint32_t get_hwcap(void) {
#ifdef __linux__
  /* A call with an empty buffer is used only to learn the required size. */
  const int auxv_bytes = prctl_auxv(NULL, 0);
  if (auxv_bytes < 0) {
    return 0;
  }

  unsigned long *auxv = __builtin_alloca(auxv_bytes);
  if (prctl_auxv(auxv, auxv_bytes) < 0) {
    return 0;
  }

  /* The buffer is walked as consecutive (type, value) pairs. */
  const size_t pair_count = auxv_bytes / (2 * sizeof(unsigned long));
  size_t pair = 0;
  while (pair < pair_count) {
    const unsigned long *entry = &auxv[2 * pair];
    if (entry[0] == 16 /* AT_HWCAP */) {
      return (uint32_t)entry[1] >> 4;
    }
    ++pair;
  }
  return 0;
#else
  return __builtin_loongarch_cpucfg(2) >> 6;
#endif
}
#endif /* IS_LOONGARCH */

enum cpu_feature {
SSE2 = 1 << 0,
Expand All @@ -98,6 +134,8 @@ enum cpu_feature {
AVX2 = 1 << 4,
AVX512F = 1 << 5,
AVX512VL = 1 << 6,
LSX = 1 << 0,
LASX = 1 << 1,
/* ... */
UNDEFINED = 1 << 30
};
Expand Down Expand Up @@ -157,6 +195,11 @@ static
}
ATOMIC_STORE(g_cpu_features, features);
return features;
#elif defined(IS_LOONGARCH)
uint32_t hwcap = get_hwcap();
features = hwcap & (LSX | LASX);
ATOMIC_STORE(g_cpu_features, features);
return features;
#else
/* How to detect NEON? */
return 0;
Expand Down Expand Up @@ -294,6 +337,26 @@ void blake3_hash_many(const uint8_t *const *inputs, size_t num_inputs,
return;
#endif

#if defined(IS_LOONGARCH)
/* LoongArch dispatch: LASX is tried before LSX. Each implementation is used
 * only if it was not compiled out (BLAKE3_NO_LASX / BLAKE3_NO_LSX) and the
 * matching bit is set in the detected features. If neither applies, control
 * falls through to the code after this section. */
const enum cpu_feature features = get_cpu_features();
MAYBE_UNUSED(features);
#if !defined(BLAKE3_NO_LASX)
if (features & LASX) {
blake3_hash_many_lasx(inputs, num_inputs, blocks, key, counter,
increment_counter, flags, flags_start, flags_end,
out);
return;
}
#endif
#if !defined(BLAKE3_NO_LSX)
if (features & LSX) {
blake3_hash_many_lsx(inputs, num_inputs, blocks, key, counter,
increment_counter, flags, flags_start, flags_end, out);
return;
}
#endif
#endif /* IS_LOONGARCH */

blake3_hash_many_portable(inputs, num_inputs, blocks, key, counter,
increment_counter, flags, flags_start, flags_end,
out);
Expand Down Expand Up @@ -328,5 +391,19 @@ size_t blake3_simd_degree(void) {
#if BLAKE3_USE_NEON == 1
return 4;
#endif
#if defined(IS_LOONGARCH)
/* LoongArch: report degree 8 when LASX is compiled in and detected,
 * otherwise 4 when LSX is compiled in and detected. The check order matches
 * the LASX-before-LSX dispatch in blake3_hash_many. */
const enum cpu_feature features = get_cpu_features();
MAYBE_UNUSED(features);
#if !defined(BLAKE3_NO_LASX)
if (features & LASX) {
return 8;
}
#endif
#if !defined(BLAKE3_NO_LSX)
if (features & LSX) {
return 4;
}
#endif
#endif /* IS_LOONGARCH */
return 1;
}
Loading
Loading