39 changes: 39 additions & 0 deletions .github/workflows/build.yml
@@ -0,0 +1,39 @@
name: Build and Lint on Multiple Platforms

on:
  push:
    branches:
      - main
  pull_request:
    branches:
      - main

jobs:
  build:
    runs-on: ${{ matrix.os }}
    strategy:
      fail-fast: false
      matrix:
        os: [ubuntu-latest, windows-latest, macOS-latest]
    steps:
      - uses: actions/checkout@v4
      - uses: dtolnay/rust-toolchain@nightly
      - run: rustup component add clippy
      - uses: Swatinem/rust-cache@v2
        with:
          # Only save the cache on pushes to `main`:
          save-if: ${{ github.ref == 'refs/heads/main' }}
      - uses: Jimver/[email protected]
        id: cuda-toolkit
        with:
          cuda: '12.4.1'
          method: 'network'
          sub-packages: '[ "nvcc", "cudart", "nvrtc" ]'
        if: matrix.os != 'macOS-latest'

      - name: Clippy Check
        run: cargo clippy -- -D warnings
        continue-on-error: true

      - name: Build
        run: cargo build
2 changes: 1 addition & 1 deletion README.md
@@ -22,7 +22,7 @@ check [Bend](https://github.com/HigherOrderCO/Bend) instead.
Usage
-----

> DISCLAIMER: Windows is currently not supported, please use [WSL](https://learn.microsoft.com/en-us/windows/wsl/install) for now as a workaround.
> For a native Windows build, you will need at least Visual Studio 2022 version 17.5 for [C11 Atomics](https://devblogs.microsoft.com/cppblog/c11-atomics-in-visual-studio-2022-version-17-5-preview-2/) support.

First install the dependencies:
* If you want to use the C runtime, install a C-11 compatible compiler like GCC or Clang.
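As a quick sanity check of the C11 atomics requirement mentioned in the README note above, a minimal test program (hypothetical, not part of this PR) can be built with the same MSVC switches that the build.rs change below adds:

/* atomics_check.c: hypothetical smoke test.
   MSVC:  cl /std:c11 /experimental:c11atomics atomics_check.c
   GCC:   gcc -std=c11 atomics_check.c */
#include <stdatomic.h>
#include <stdio.h>

int main(void) {
  _Atomic int counter = 0;
  atomic_fetch_add_explicit(&counter, 1, memory_order_relaxed);
  printf("counter = %d\n", atomic_load(&counter));
  return 0;
}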
9 changes: 8 additions & 1 deletion build.rs
@@ -6,7 +6,14 @@ fn main() {
println!("cargo:rerun-if-changed=src/hvm.c");
println!("cargo:rerun-if-changed=src/hvm.cu");

match cc::Build::new()
let mut build = cc::Build::new();

if cfg!(target_env = "msvc") {
build.flag("/experimental:c11atomics");
build.std("c11");
}

match build
.file("src/hvm.c")
.opt_level(3)
.warnings(false)
51 changes: 40 additions & 11 deletions src/hvm.c
@@ -1,6 +1,18 @@
#include <inttypes.h>
#include <math.h>
#ifdef _WIN32
#include <windows.h>
#include <threads.h>
#else
#include <pthread.h>
#include <time.h>
typedef pthread_t thrd_t;
typedef void* thrd_start_t;
#define thrd_create(a, b, c) pthread_create(a, NULL, b, c)
#define thrd_join(a, b) pthread_join(a, b)
#define thrd_yield() sched_yield()
#define thrd_sleep(a, b) nanosleep(a, b)
#endif
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>
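The shim above keeps pthread semantics on POSIX while Windows resolves the same thrd_* names to the native C11 <threads.h> API. A minimal usage sketch (hypothetical, not taken from the PR; the worker keeps the pthread-style void* return that this file uses):

static void* worker(void* arg) {
  (void)arg;           // unused
  thrd_yield();        // sched_yield() on POSIX, C11 thrd_yield() on Windows
  return NULL;
}

static void spawn_and_wait(void) {
  thrd_t t;
  thrd_create(&t, worker, NULL);  // pthread_create(&t, NULL, worker, NULL) on POSIX
  thrd_join(t, NULL);             // pthread_join(t, NULL) on POSIX
}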
@@ -113,9 +125,9 @@ typedef u32 Numb; // Numb ::= 29-bit (rounded up to u32)
#define G_RBAG_LEN (TPC * RLEN)

typedef struct Net {
APair node_buf[G_NODE_LEN]; // global node buffer
APort vars_buf[G_VARS_LEN]; // global vars buffer
APair rbag_buf[G_RBAG_LEN]; // global rbag buffer
APair* node_buf; // global node buffer, size = G_NODE_LEN
APort* vars_buf; // global vars buffer, size = G_VARS_LEN
APair* rbag_buf; // global rbag buffer, size = G_RBAG_LEN
a64 itrs; // interaction count
a32 idle; // idle thread counter
} Net;
@@ -268,9 +280,11 @@ static inline void swap(Port *a, Port *b) {
Port x = *a; *a = *b; *b = x;
}

#ifndef _WIN32
u32 min(u32 a, u32 b) {
return (a < b) ? a : b;
}
#endif

// A simple spin-wait barrier using atomic operations
a64 a_reached = 0; // number of threads that reached the current barrier
@@ -284,7 +298,7 @@ void sync_threads() {
} else {
u32 tries = 0;
while (atomic_load_explicit(&a_barrier, memory_order_acquire) == barrier_old) {
sched_yield();
thrd_yield();
}
}
}
@@ -301,10 +315,20 @@ u32 global_sum(u32 x) {
}

// TODO: write a time64() function that returns the time as fast as possible as a u64
// The time should be in nanoseconds, but not related to UTC time
static inline u64 time64() {
#ifndef _WIN32
struct timespec ts;
clock_gettime(CLOCK_MONOTONIC, &ts);
return (u64)ts.tv_sec * 1000000000ULL + (u64)ts.tv_nsec;
#else
// @developedby: We don't care about system time; this is just a timer.
LARGE_INTEGER freq;
LARGE_INTEGER counter;
QueryPerformanceFrequency(&freq);
QueryPerformanceCounter(&counter);
return (u64)((counter.QuadPart * 1000000000ULL) / freq.QuadPart);
#endif
}
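Both branches of time64() return a monotonic nanosecond count unrelated to wall-clock time, so elapsed time is just a difference; a small usage sketch (hypothetical, not part of this diff):

u64 t0 = time64();
// ... work to be measured ...
u64 elapsed_ns = time64() - t0;
fprintf(stderr, "took %.3f ms\n", (double)elapsed_ns / 1.0e6);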

// Ports / Pairs / Rules
@@ -645,6 +669,11 @@ static inline void net_init(Net* net) {
// is that needed?
atomic_store(&net->itrs, 0);
atomic_store(&net->idle, 0);

// allocates global buffers
net->node_buf = malloc(G_NODE_LEN * sizeof(APair));
net->vars_buf = malloc(G_VARS_LEN * sizeof(APort));
net->rbag_buf = malloc(G_RBAG_LEN * sizeof(APair));
}
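With the buffers moved from fixed-size struct members to the heap, a matching cleanup routine is the natural counterpart (the malloc results above are also left unchecked). A sketch, assuming a hypothetical net_free that is not shown in this diff:

static inline void net_free(Net* net) {
  free(net->node_buf);
  free(net->vars_buf);
  free(net->rbag_buf);
  net->node_buf = NULL;
  net->vars_buf = NULL;
  net->rbag_buf = NULL;
}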

// Allocator
@@ -1133,7 +1162,7 @@ void evaluator(Net* net, TM* tm, Book* book) {
//if (stolen) {
//push_redex(net, tm, trg);
//} else {
//// do nothing: will sched_yield
//// do nothing: will thrd_yield
//}
//// If we see a non-stealable redex, try the next one
//} else {
@@ -1153,7 +1182,7 @@
}

// Chill...
sched_yield();
thrd_yield();
// Halt if all threads are idle
if (tick % 256 == 0) {
if (atomic_load_explicit(&net->idle, memory_order_relaxed) == TPC) {
@@ -1179,7 +1208,7 @@ typedef struct {
Book* book;
} ThreadArg;

void* thread_func(void* arg) {
thrd_start_t thread_func(void* arg) {
ThreadArg* data = (ThreadArg*)arg;
evaluator(data->net, data->tm, data->book);
return NULL;
@@ -1203,14 +1232,14 @@ void normalize(Net* net, Book* book) {
}

// Spawns the evaluation threads
pthread_t threads[TPC];
thrd_t threads[TPC];
for (u32 t = 0; t < TPC; ++t) {
pthread_create(&threads[t], NULL, thread_func, &thread_arg[t]);
thrd_create(&threads[t], thread_func, &thread_arg[t]);
}

// Wait for the threads to finish
for (u32 t = 0; t < TPC; ++t) {
pthread_join(threads[t], NULL);
thrd_join(threads[t], NULL);
}
}

@@ -1532,7 +1561,7 @@ Port io_sleep(Net* net, Book* book, u32 argc, Port* argv) {
struct timespec ts;
ts.tv_sec = dur_ns / 1000000000;
ts.tv_nsec = dur_ns % 1000000000;
nanosleep(&ts, NULL);
thrd_sleep(&ts, NULL);

// Return an eraser
return new_port(ERA, 0);
41 changes: 30 additions & 11 deletions src/hvm.cu
@@ -346,13 +346,13 @@ const u32 G_RBAG_LEN = TPB * BPG * RLEN * 3; // max 4m redexes
struct GNet {
u32 rbag_use_A; // total rbag redex count (buffer A)
u32 rbag_use_B; // total rbag redex count (buffer B)
Pair rbag_buf_A[G_RBAG_LEN]; // global redex bag (buffer A)
Pair rbag_buf_B[G_RBAG_LEN]; // global redex bag (buffer B)
Pair node_buf[G_NODE_LEN]; // global node buffer
Port vars_buf[G_VARS_LEN]; // global vars buffer
u32 node_put[TPB*BPG];
u32 vars_put[TPB*BPG];
u32 rbag_pos[TPB*BPG];
Pair* rbag_buf_A; // global redex bag (buffer A), size = G_RBAG_LEN
Pair* rbag_buf_B; // global redex bag (buffer B), size = G_RBAG_LEN
Pair* node_buf; // global node buffer, size = G_NODE_LEN
Port* vars_buf; // global vars buffer, size = G_VARS_LEN
u32* node_put; // size = TPB*BPG
u32* vars_put; // size = TPB*BPG
u32* rbag_pos; // size = TPB*BPG
u8 mode; // evaluation mode (curr)
u64 itrs; // interaction count
u64 iadd; // interaction count adder
@@ -1895,10 +1895,29 @@ __global__ void evaluator(GNet* gnet) {
// -------------------

GNet* gnet_create() {
GNet *gnet;
cudaMalloc((void**)&gnet, sizeof(GNet));
cudaMemset(gnet, 0, sizeof(GNet));
return gnet;
GNet gnet;
memset(&gnet, 0, sizeof(GNet));

#define ALLOCATE_HOST_POINTER(__host_pointer, __size) \
do { \
cudaMalloc((void**)&(__host_pointer), __size); \
cudaMemset(__host_pointer, 0, __size); \
} while(0)

ALLOCATE_HOST_POINTER(gnet.rbag_buf_A, G_RBAG_LEN * sizeof(Pair));
ALLOCATE_HOST_POINTER(gnet.rbag_buf_B, G_RBAG_LEN * sizeof(Pair));
ALLOCATE_HOST_POINTER(gnet.node_buf, G_NODE_LEN * sizeof(Pair));
ALLOCATE_HOST_POINTER(gnet.vars_buf, G_VARS_LEN * sizeof(Port));
ALLOCATE_HOST_POINTER(gnet.node_put, BPG * TPB * sizeof(u32));
ALLOCATE_HOST_POINTER(gnet.vars_put, BPG * TPB * sizeof(u32));
ALLOCATE_HOST_POINTER(gnet.rbag_pos, BPG * TPB * sizeof(u32));

#undef ALLOCATE_HOST_POINTER

GNet* gnet_d;
cudaMalloc(&gnet_d, sizeof(GNet));
cudaMemcpy(gnet_d, &gnet, sizeof(GNet), cudaMemcpyHostToDevice);
return gnet_d;
}
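gnet_create now builds the GNet on the host, cudaMallocs each buffer separately, and copies the pointer-holding struct to the device. Releasing it therefore needs the pointers copied back first; a sketch of a matching destructor (hypothetical gnet_destroy, not part of this diff):

void gnet_destroy(GNet* gnet_d) {
  GNet gnet;
  // recover the device pointers stored inside the device-side struct
  cudaMemcpy(&gnet, gnet_d, sizeof(GNet), cudaMemcpyDeviceToHost);
  cudaFree(gnet.rbag_buf_A);
  cudaFree(gnet.rbag_buf_B);
  cudaFree(gnet.node_buf);
  cudaFree(gnet.vars_buf);
  cudaFree(gnet.node_put);
  cudaFree(gnet.vars_put);
  cudaFree(gnet.rbag_pos);
  cudaFree(gnet_d);
}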

u32 gnet_get_rlen(GNet* gnet, u32 turn) {