From 43cd0f79078f3df3db2ba0e936f806dd95abc229 Mon Sep 17 00:00:00 2001
From: Sepcnt <30561671+sepcnt@users.noreply.github.com>
Date: Sun, 19 May 2024 16:54:19 +0000
Subject: [PATCH 1/5] Add support for MSVC and GitHub Actions

---
 .github/workflows/build.yml | 36 ++++++++++++++++++++++++++++++
 build.rs                    |  9 +++++++-
 src/hvm.c                   | 44 +++++++++++++++++++++++++++----------
 src/hvm.cu                  | 41 ++++++++++++++++++++++++----------
 4 files changed, 107 insertions(+), 23 deletions(-)
 create mode 100644 .github/workflows/build.yml

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
new file mode 100644
index 00000000..b1778b18
--- /dev/null
+++ b/.github/workflows/build.yml
@@ -0,0 +1,36 @@
+name: Build and Lint on Multiple Platforms
+
+on:
+  push:
+    branches:
+      - main
+  pull_request:
+    branches:
+      - main
+
+jobs:
+  build:
+    runs-on: ${{ matrix.os }}
+    strategy:
+      fail-fast: false
+      matrix:
+        os: [ubuntu-latest, windows-latest, macOS-latest]
+    steps:
+      - uses: actions/checkout@v4
+      - uses: dtolnay/rust-toolchain@nightly
+      - uses: Swatinem/rust-cache@v2
+        with:
+          # To only cache runs from `master`:
+          save-if: ${{ github.ref == 'refs/heads/master' }}
+      - uses: Jimver/cuda-toolkit@v0.2.15
+        id: cuda-toolkit
+        with:
+          cuda: '12.4.1'
+        if: matrix.os != 'macOS-latest'
+
+      - name: Clippy Check
+        run: cargo clippy --all-features -- -D warnings
+        continue-on-error: true
+
+      - name: Build
+        run: cargo build --all-features
diff --git a/build.rs b/build.rs
index 5499211e..90dfe86b 100644
--- a/build.rs
+++ b/build.rs
@@ -6,9 +6,16 @@ fn main() {
   println!("cargo:rerun-if-changed=src/hvm.c");
   println!("cargo:rerun-if-changed=src/hvm.cu");
 
-  match cc::Build::new()
+  let mut build = cc::Build::new();
+
+  if cfg!(target_env = "msvc") {
+    build.flag("/experimental:c11atomics");
+  }
+
+  match build
       .file("src/hvm.c")
       .opt_level(3)
+      .std("c11")
       .warnings(false)
       .define("TPC_L2", &*tpcl2.to_string())
       .try_compile("hvm-c") {
diff --git a/src/hvm.c b/src/hvm.c
index 44790811..c857359f 100644
--- a/src/hvm.c
+++ b/src/hvm.c
@@ -1,6 +1,11 @@
 #include <inttypes.h>
 #include <math.h>
+#include <threads.h>
+#ifdef _WIN32
+#include <windows.h>
+#else
 #include <pthread.h>
+#endif
 #include <stdatomic.h>
 #include <stdint.h>
 #include <stdio.h>
@@ -113,9 +118,9 @@ typedef u32 Numb; // Numb ::= 29-bit (rounded up to u32)
 #define G_RBAG_LEN (TPC * RLEN)
 
 typedef struct Net {
-  APair node_buf[G_NODE_LEN]; // global node buffer
-  APort vars_buf[G_VARS_LEN]; // global vars buffer
-  APair rbag_buf[G_RBAG_LEN]; // global rbag buffer
+  APair* node_buf; // global node buffer, size = G_NODE_LEN
+  APort* vars_buf; // global vars buffer, size = G_VARS_LEN
+  APair* rbag_buf; // global rbag buffer, size = G_RBAG_LEN
   a64 itrs; // interaction count
   a32 idle; // idle thread counter
 } Net;
@@ -268,9 +273,11 @@ static inline void swap(Port *a, Port *b) {
   Port x = *a; *a = *b; *b = x;
 }
 
+#ifndef _WIN32
 u32 min(u32 a, u32 b) {
   return (a < b) ? a : b;
 }
+#endif
 
 // A simple spin-wait barrier using atomic operations
 a64 a_reached = 0; // number of threads that reached the current barrier
@@ -284,7 +291,7 @@ void sync_threads() {
   } else {
     u32 tries = 0;
     while (atomic_load_explicit(&a_barrier, memory_order_acquire) == barrier_old) {
-      sched_yield();
+      thrd_yield();
     }
   }
 }
@@ -301,10 +308,20 @@ u32 global_sum(u32 x) {
 }
 
 // TODO: write a time64() function that returns the time as fast as possible as a u64
+// The time should be in nanoseconds, but not related to UTC time
 static inline u64 time64() {
+#ifndef _WIN32
   struct timespec ts;
   clock_gettime(CLOCK_MONOTONIC, &ts);
   return (u64)ts.tv_sec * 1000000000ULL + (u64)ts.tv_nsec;
+#else
+  // @developedby: We dont care about system time, this is just a timer.
+  LARGE_INTEGER freq;
+  LARGE_INTEGER counter;
+  QueryPerformanceFrequency(&freq);
+  QueryPerformanceCounter(&counter);
+  return (u64)((counter.QuadPart * 1000000000ULL) / freq.QuadPart);
+#endif
 }
 
 // Ports / Pairs / Rules
@@ -645,6 +662,11 @@ static inline void net_init(Net* net) {
   // is that needed?
   atomic_store(&net->itrs, 0);
   atomic_store(&net->idle, 0);
+
+  // allocates global buffers
+  net->node_buf = malloc(G_NODE_LEN * sizeof(APair));
+  net->vars_buf = malloc(G_VARS_LEN * sizeof(APort));
+  net->rbag_buf = malloc(G_RBAG_LEN * sizeof(APair));
 }
 
 // Allocator
@@ -1133,7 +1155,7 @@ void evaluator(Net* net, TM* tm, Book* book) {
         //if (stolen) {
           //push_redex(net, tm, trg);
         //} else {
-          //// do nothing: will sched_yield
+          //// do nothing: will thrd_yield
         //}
       //// If we see a non-stealable redex, try the next one
       //} else {
@@ -1153,7 +1175,7 @@ void evaluator(Net* net, TM* tm, Book* book) {
       }
       
       // Chill...
-      sched_yield();
+      thrd_yield();
       // Halt if all threads are idle
       if (tick % 256 == 0) {
         if (atomic_load_explicit(&net->idle, memory_order_relaxed) == TPC) {
@@ -1179,7 +1201,7 @@ typedef struct {
   Book* book;
 } ThreadArg;
 
-void* thread_func(void* arg) {
+thrd_start_t thread_func(void* arg) {
   ThreadArg* data = (ThreadArg*)arg;
   evaluator(data->net, data->tm, data->book);
   return NULL;
@@ -1203,14 +1225,14 @@ void normalize(Net* net, Book* book) {
   }
 
   // Spawns the evaluation threads
-  pthread_t threads[TPC];
+  thrd_t threads[TPC];
   for (u32 t = 0; t < TPC; ++t) {
-    pthread_create(&threads[t], NULL, thread_func, &thread_arg[t]);
+    thrd_create(&threads[t], thread_func, &thread_arg[t]);
   }
 
   // Wait for the threads to finish
   for (u32 t = 0; t < TPC; ++t) {
-    pthread_join(threads[t], NULL);
+    thrd_join(threads[t], NULL);
   }
 }
 
@@ -1532,7 +1554,7 @@ Port io_sleep(Net* net, Book* book, u32 argc, Port* argv) {
   struct timespec ts;
   ts.tv_sec = dur_ns / 1000000000;
   ts.tv_nsec = dur_ns % 1000000000;
-  nanosleep(&ts, NULL);
+  thrd_sleep(&ts, NULL);
 
   // Return an eraser
   return new_port(ERA, 0);
diff --git a/src/hvm.cu b/src/hvm.cu
index 4fb3aa1d..0e39e718 100644
--- a/src/hvm.cu
+++ b/src/hvm.cu
@@ -346,13 +346,13 @@ const u32 G_RBAG_LEN = TPB * BPG * RLEN * 3; // max 4m redexes
 struct GNet {
   u32  rbag_use_A; // total rbag redex count (buffer A)
   u32  rbag_use_B; // total rbag redex count (buffer B)
-  Pair rbag_buf_A[G_RBAG_LEN]; // global redex bag (buffer A)
-  Pair rbag_buf_B[G_RBAG_LEN]; // global redex bag (buffer B)
-  Pair node_buf[G_NODE_LEN]; // global node buffer
-  Port vars_buf[G_VARS_LEN]; // global vars buffer
-  u32  node_put[TPB*BPG];
-  u32  vars_put[TPB*BPG];
-  u32  rbag_pos[TPB*BPG];
+  Pair* rbag_buf_A; // global redex bag (buffer A), size = G_RBAG_LEN
+  Pair* rbag_buf_B; // global redex bag (buffer B), size = G_RBAG_LEN
+  Pair* node_buf; // global node buffer, size = G_NODE_LEN
+  Port* vars_buf; // global vars buffer, size = G_VARS_LEN
+  u32*  node_put; // size = TPB*BPG
+  u32*  vars_put; // size = TPB*BPG
+  u32*  rbag_pos; // size = TPB*BPG
   u8   mode; // evaluation mode (curr)
   u64  itrs; // interaction count
   u64  iadd; // interaction count adder
@@ -1895,10 +1895,29 @@ __global__ void evaluator(GNet* gnet) {
 // -------------------
 
 GNet* gnet_create() {
-  GNet *gnet;
-  cudaMalloc((void**)&gnet, sizeof(GNet));
-  cudaMemset(gnet, 0, sizeof(GNet));
-  return gnet;
+  GNet gnet;
+  memset(&gnet, 0, sizeof(GNet));
+
+  #define ALLOCATE_HOST_POINTER(__host_pointer, __size) \
+    do { \
+      cudaMalloc((void**)&(__host_pointer), __size); \
+      cudaMemset(__host_pointer, 0, __size); \
+    } while(0)
+
+  ALLOCATE_HOST_POINTER(gnet.rbag_buf_A, G_RBAG_LEN * sizeof(Pair));
+  ALLOCATE_HOST_POINTER(gnet.rbag_buf_B, G_RBAG_LEN * sizeof(Pair));
+  ALLOCATE_HOST_POINTER(gnet.node_buf, G_NODE_LEN * sizeof(Pair));
+  ALLOCATE_HOST_POINTER(gnet.vars_buf, G_VARS_LEN * sizeof(Port));
+  ALLOCATE_HOST_POINTER(gnet.node_put, BPG * TPB * sizeof(u32));
+  ALLOCATE_HOST_POINTER(gnet.vars_put, BPG * TPB * sizeof(u32));
+  ALLOCATE_HOST_POINTER(gnet.rbag_pos, BPG * TPB * sizeof(u32));
+
+  #undef ALLOCATE_HOST_POINTER
+
+  GNet* gnet_d;
+  cudaMalloc(&gnet_d, sizeof(GNet));
+  cudaMemcpy(gnet_d, &gnet, sizeof(GNet), cudaMemcpyHostToDevice);
+  return gnet_d;
 }
 
 u32 gnet_get_rlen(GNet* gnet, u32 turn) {

From 0d51264d2a0a09bb7bd53d161b6809dc666afd03 Mon Sep 17 00:00:00 2001
From: Sepcnt <30561671+sepcnt@users.noreply.github.com>
Date: Sun, 19 May 2024 17:17:21 +0000
Subject: [PATCH 2/5] Polyfill for thread.h and fix disk issue of Linux CI

---
 .github/workflows/build.yml |  2 ++
 src/hvm.c                   | 10 ++++++++--
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index b1778b18..d40d1f25 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -26,6 +26,8 @@ jobs:
         id: cuda-toolkit
         with:
           cuda: '12.4.1'
+          method: 'network'
+          sub-packages: '["nvcc"]'
         if: matrix.os != 'macOS-latest'
 
       - name: Clippy Check
diff --git a/src/hvm.c b/src/hvm.c
index c857359f..897fbcb4 100644
--- a/src/hvm.c
+++ b/src/hvm.c
@@ -1,10 +1,16 @@
 #include <inttypes.h>
 #include <math.h>
-#include <threads.h>
 #ifdef _WIN32
 #include <windows.h>
+#include <threads.h>
 #else
 #include <pthread.h>
+typedef pthread_t thrd_t;
+typedef void* thrd_start_t;
+#define thrd_create(a, b, c) pthread_create(a, NULL, b, c)
+#define thrd_join(a, b) pthread_join(a, b)
+#define thrd_yield() sched_yield()
+#define thrd_sleep(a, b) nanosleep(a, b)
 #endif
 #include <stdatomic.h>
 #include <stdint.h>
@@ -662,7 +668,7 @@ static inline void net_init(Net* net) {
   // is that needed?
   atomic_store(&net->itrs, 0);
   atomic_store(&net->idle, 0);
-
+  
   // allocates global buffers
   net->node_buf = malloc(G_NODE_LEN * sizeof(APair));
   net->vars_buf = malloc(G_VARS_LEN * sizeof(APort));

From 9a99d38501156930afc295f84313be288e5b7ae6 Mon Sep 17 00:00:00 2001
From: Sepcnt <30561671+sepcnt@users.noreply.github.com>
Date: Sun, 19 May 2024 17:33:24 +0000
Subject: [PATCH 3/5] Fix CI bugs

---
 .github/workflows/build.yml | 2 +-
 src/hvm.c                   | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index d40d1f25..de854938 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -18,6 +18,7 @@ jobs:
     steps:
       - uses: actions/checkout@v4
       - uses: dtolnay/rust-toolchain@nightly
+      - run: rustup component add clippy
       - uses: Swatinem/rust-cache@v2
         with:
           # To only cache runs from `master`:
@@ -27,7 +28,6 @@ jobs:
         with:
           cuda: '12.4.1'
           method: 'network'
-          sub-packages: '["nvcc"]'
         if: matrix.os != 'macOS-latest'
 
       - name: Clippy Check
diff --git a/src/hvm.c b/src/hvm.c
index 897fbcb4..e5dbea01 100644
--- a/src/hvm.c
+++ b/src/hvm.c
@@ -5,6 +5,7 @@
 #include <threads.h>
 #else
 #include <pthread.h>
+#include <time.h>
 typedef pthread_t thrd_t;
 typedef void* thrd_start_t;
 #define thrd_create(a, b, c) pthread_create(a, NULL, b, c)

From 8f15382a0192f0e6465bc7d9238e5e82d23c3091 Mon Sep 17 00:00:00 2001
From: Sepcnt <30561671+sepcnt@users.noreply.github.com>
Date: Mon, 20 May 2024 02:51:54 +0000
Subject: [PATCH 4/5] Fix GCC std flag

---
 .github/workflows/build.yml | 5 +++--
 build.rs                    | 2 +-
 2 files changed, 4 insertions(+), 3 deletions(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index de854938..6a34106a 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -28,11 +28,12 @@ jobs:
         with:
           cuda: '12.4.1'
           method: 'network'
+          sub-packages: '[ "nvcc", "cudart", "nvrtc" ]'
         if: matrix.os != 'macOS-latest'
 
       - name: Clippy Check
-        run: cargo clippy --all-features -- -D warnings
+        run: cargo clippy -- -D warnings
         continue-on-error: true
 
       - name: Build
-        run: cargo build --all-features
+        run: cargo build
diff --git a/build.rs b/build.rs
index 90dfe86b..9d98b569 100644
--- a/build.rs
+++ b/build.rs
@@ -10,12 +10,12 @@ fn main() {
 
   if cfg!(target_env = "msvc") {
     build.flag("/experimental:c11atomics");
+    build.std("c11");
   }
 
   match build
       .file("src/hvm.c")
       .opt_level(3)
-      .std("c11")
       .warnings(false)
       .define("TPC_L2", &*tpcl2.to_string())
       .try_compile("hvm-c") {

From 99dcdd0375f204f9a760de493909e3042dbe45b7 Mon Sep 17 00:00:00 2001
From: Sepcnt <30561671+sepcnt@users.noreply.github.com>
Date: Tue, 21 May 2024 01:42:16 +0000
Subject: [PATCH 5/5] Update README.md

---
 README.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/README.md b/README.md
index e483a994..ebca1e2b 100644
--- a/README.md
+++ b/README.md
@@ -22,7 +22,7 @@ check [Bend](https://github.com/HigherOrderCO/Bend) instead.
 Usage
 -----
 
-> DISCLAIMER: Windows is currently not supported, please use [WSL](https://learn.microsoft.com/en-us/windows/wsl/install) for now as a workaround.
+> For native Windows build, you will need at least Visual Studio 2022 version 17.5 for [C11 Atomics](https://devblogs.microsoft.com/cppblog/c11-atomics-in-visual-studio-2022-version-17-5-preview-2/) support.
 
 First install the dependencies:
 * If you want to use the C runtime, install a C-11 compatible compiler like GCC or Clang.