Skip to content

Conversation

@any1
Copy link

@any1 any1 commented Oct 2, 2025

This implements a simple version of oneapi::tbb::parallel_invoke using plain pthreads.

The pthread implementation is slightly less performant, but it may be better suited for light-weight embedded systems than TBB and C++.

Before bifurcating execution, a thread is allocated from a list of free threads or it is created if the total number of threads is less than the number of cores on the system.

If there is no thread available, both the left and right subtrees are processed sequentially by the current thread.

If a thread could be allocated, the thread is given an execution context for the left subtree and instructed to start execution via a barrier.

The current thread continues to process the right subtree and synchronises again with the allocated thread via a barrier and returns it to the list of free threads.

Simple Benchmarks

I tested this implementation using a b3sum implementation of my own, which I've included below. I ran it against a 1 GB file in tmpfs. By the way, would you be interested in a PR to include this program?

With TBB:

$ perf stat ./b3sum /tmp/1G
73ad29a9072e92a9e5e67f6eaf977b25ed1aaca907ac334fe0f72f12f315a692  /tmp/1G

 Performance counter stats for './b3sum /tmp/1G':

          1,337.83 msec task-clock:u                     #   14.798 CPUs utilized             
                 0      context-switches:u               #    0.000 /sec                      
                 0      cpu-migrations:u                 #    0.000 /sec                      
            17,513      page-faults:u                    #   13.091 K/sec                     
     2,982,215,674      instructions:u                   #    0.63  insn per cycle            
                                                  #    0.08  stalled cycles per insn   
     4,701,093,915      cycles:u                         #    3.514 GHz                       
       231,391,544      stalled-cycles-frontend:u        #    4.92% frontend cycles idle      
        59,270,444      branches:u                       #   44.303 M/sec                     
         1,489,339      branch-misses:u                  #    2.51% of all branches           

       0.090405527 seconds time elapsed

       1.107422000 seconds user
       0.223432000 seconds sys

With pthread:

$ perf stat ./b3sum /tmp/1G
73ad29a9072e92a9e5e67f6eaf977b25ed1aaca907ac334fe0f72f12f315a692  /tmp/1G

 Performance counter stats for './b3sum /tmp/1G':

          1,093.70 msec task-clock:u                     #   11.031 CPUs utilized             
                 0      context-switches:u               #    0.000 /sec                      
                 0      cpu-migrations:u                 #    0.000 /sec                      
            17,036      page-faults:u                    #   15.576 K/sec                     
     2,746,500,238      instructions:u                   #    0.98  insn per cycle            
                                                  #    0.04  stalled cycles per insn   
     2,791,463,311      cycles:u                         #    2.552 GHz                       
       111,535,892      stalled-cycles-frontend:u        #    4.00% frontend cycles idle      
        20,932,525      branches:u                       #   19.139 M/sec                     
           487,552      branch-misses:u                  #    2.33% of all branches           

       0.099144397 seconds time elapsed

       0.745935000 seconds user
       0.317554000 seconds sys

b3sum.c

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <stdbool.h>
#include <stdint.h>
#include <string.h>
#include <strings.h>
#include <getopt.h>
#include <fcntl.h>
#include <sys/mman.h>
#include <sys/stat.h>

#include <blake3.h>

/*
 * Encode srclen bytes from src as lower-case hexadecimal text in dst.
 *
 * At most dstlen bytes are written, including the terminating NUL. Input
 * bytes that do not fit are dropped whole, so no half-encoded byte is ever
 * emitted. When dstlen is 0 nothing is written at all.
 *
 * Returns the number of hex characters written, not counting the NUL.
 */
static size_t bin_to_hex(char *dst, size_t dstlen, const uint8_t* src, size_t srclen)
{
	static const char lut[] = "0123456789abcdef";

	/* No room even for the terminator: touching dst[0] would overflow. */
	if (dstlen == 0)
		return 0;

	size_t k = 0;
	/* Each input byte needs two characters plus room for the final NUL. */
	for (size_t i = 0; i < srclen && k + 2 < dstlen; ++i) {
		dst[k++] = lut[src[i] >> 4];
		dst[k++] = lut[src[i] & 0x0f];
	}
	dst[k] = '\0';

	return k;
}

/*
 * Compute the BLAKE3 hash of the file at path and store outlen bytes of
 * digest in out.
 *
 * Files reporting a non-zero size are mapped read-only and hashed in one
 * call. Files reporting size 0 (empty files, and files whose size is not
 * known up front) are hashed by reading the descriptor instead, because
 * mmap() rejects zero-length mappings.
 *
 * Returns 0 on success. On failure a diagnostic is printed to stderr and -1
 * is returned.
 */
static int hash_file(uint8_t *out, size_t outlen, const char *path)
{
	int ret = -1;
	void *data = MAP_FAILED;
	size_t size = 0;
	blake3_hasher hash;

	int fd = open(path, O_RDONLY);
	if (fd == -1) {
		fprintf(stderr, "open %s: %m\n", path);
		return -1;
	}

	/* Query the open descriptor so the size belongs to the file we read. */
	struct stat st = {0};
	if (fstat(fd, &st) < 0) {
		fprintf(stderr, "stat %s: %m\n", path);
		goto out;
	}

	blake3_hasher_init(&hash);

	if (st.st_size > 0) {
		size = (size_t)st.st_size;
		data = mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd, 0);
		if (data == MAP_FAILED) {
			fprintf(stderr, "mmap %s: %m\n", path);
			goto out;
		}
#ifdef BLAKE3_USE_TBB
		blake3_hasher_update_tbb(&hash, data, size);
#else
		blake3_hasher_update(&hash, data, size);
#endif
	} else {
		/* Nothing to map; drain whatever the descriptor yields. */
		uint8_t chunk[16384];
		for (;;) {
			ssize_t n = read(fd, chunk, sizeof(chunk));
			if (n < 0) {
				fprintf(stderr, "read %s: %m\n", path);
				goto out;
			}
			if (n == 0)
				break;
			blake3_hasher_update(&hash, chunk, (size_t)n);
		}
	}

	blake3_hasher_finalize(&hash, out, outlen);

	ret = 0;
out:
	if (data != MAP_FAILED)
		munmap(data, size);
	close(fd);
	return ret;
}

/*
 * Compute the BLAKE3 hash of everything readable from stream and store
 * outlen bytes of digest in out.
 *
 * The stream is consumed in fixed-size chunks until fread() returns 0.
 *
 * Returns 0 on success. Returns -1 (after printing a diagnostic to stderr)
 * if the buffer cannot be allocated or the stream reports a read error; in
 * that case out is left untouched.
 */
static int hash_stream(uint8_t *out, size_t outlen, FILE *stream)
{
	enum { BUFFER_SIZE = 65536 };

	uint8_t *buffer = malloc(BUFFER_SIZE);
	if (!buffer) {
		fprintf(stderr, "OOM\n");
		return -1;
	}

	blake3_hasher hash;
	blake3_hasher_init(&hash);

	for (;;) {
		size_t n_read = fread(buffer, 1, BUFFER_SIZE, stream);
		if (n_read == 0)
			break;

		blake3_hasher_update(&hash, buffer, n_read);
	}

	/*
	 * fread() returns 0 both at end-of-file and on error; only the former
	 * means the whole input was hashed. Report before free() so errno is
	 * still the one set by the failed read.
	 */
	if (ferror(stream)) {
		fprintf(stderr, "read: %m\n");
		free(buffer);
		return -1;
	}

	free(buffer);

	blake3_hasher_finalize(&hash, out, outlen);

	return 0;
}

/*
 * Hash one input and print "<hex digest>  <path>" on stdout.
 *
 * A path of "-" selects standard input; anything else is hashed as a file.
 * Returns 0 on success and -1 if hashing failed (nothing is printed then).
 */
static int show_checksum(const char *path)
{
	uint8_t digest[32];
	const bool from_stdin = strcmp(path, "-") == 0;

	const int rc = from_stdin
		? hash_stream(digest, sizeof(digest), stdin)
		: hash_file(digest, sizeof(digest), path);
	if (rc < 0)
		return -1;

	/* Two hex characters per digest byte plus the terminating NUL. */
	char text[sizeof(digest) * 2 + 1];
	bin_to_hex(text, sizeof(text), digest, sizeof(digest));

	printf("%s  %s\n", text, path);
	return 0;
}

/*
 * Verify every "<hex digest><whitespace><file name>" line of a checksum
 * list.
 *
 * path names the list; "-" reads it from standard input. For each entry the
 * named file is hashed and "<file>: OK" or "<file>: FAILED" is printed on
 * stdout. The file name is everything after the separating whitespace up to
 * the end of the line, so names containing spaces are handled. Blank lines
 * are skipped. A line without a file name stops processing.
 *
 * Returns 0 if every entry verified, -1 if the list could not be opened, an
 * entry was malformed, a file could not be hashed, or a digest mismatched.
 */
static int check_sum_file(const char *path)
{
	int res = 0;
	FILE *fp = NULL;

	if (strcmp(path, "-") == 0) {
		fp = stdin;
	} else {
		fp = fopen(path, "r");
		if (!fp) {
			fprintf(stderr, "fopen %s: %m\n", path);
			return -1;
		}
	}

	char *line = NULL;
	size_t linesize = 0;

	for (;;) {
		ssize_t len = getline(&line, &linesize, fp);
		if (len < 0)
			break;

		/* Drop the line terminator so it never becomes part of the name. */
		if (len > 0 && line[len - 1] == '\n')
			line[len - 1] = '\0';

		char *sum_hex = line + strspn(line, " \t");
		if (*sum_hex == '\0')
			continue;

		/* The digest ends at the first blank; the name follows the gap. */
		char *sep = sum_hex + strcspn(sum_hex, " \t");
		char *file_path = sep + strspn(sep, " \t");

		if (*file_path == '\0') {
			res = -1;
			fprintf(stderr, "ERROR: Badly formed input for check\n");
			break;
		}
		*sep = '\0';

		uint8_t actual_sum[32];
		if (hash_file(actual_sum, sizeof(actual_sum), file_path) < 0) {
			res = -1;
			continue;
		}

		char actual_sum_hex[sizeof(actual_sum) * 2 + 1];
		bin_to_hex(actual_sum_hex, sizeof(actual_sum_hex), actual_sum, sizeof(actual_sum));

		/* Accept upper- or lower-case hex in the list. */
		if (strcasecmp(sum_hex, actual_sum_hex) == 0) {
			printf("%s: OK\n", file_path);
		} else {
			res = -1;
			printf("%s: FAILED\n", file_path);
		}

	}

	free(line);
	if (fp != stdin) {
		fclose(fp);
	}
	return res;
}

/*
 * Print a checksum line for each of the n_args inputs in args, or for
 * standard input when no inputs are given.
 *
 * Returns 0 if every input was hashed, 1 if any of them failed.
 */
static int generate_sums(int n_args, char **args)
{
	if (n_args == 0)
		return show_checksum("-") != 0;

	int failed = 0;
	int idx = 0;

	/* Keep going after a failure so the remaining inputs are still hashed. */
	while (idx < n_args) {
		if (show_checksum(args[idx]) < 0)
			failed = 1;
		idx++;
	}

	return failed;
}

/*
 * Verify each of the n_args checksum lists in args, or the list on standard
 * input when none are given.
 *
 * Returns 0 if every list verified, 1 otherwise. The result is used as the
 * process exit status, so the -1 from check_sum_file() is mapped to 1 on
 * every path.
 */
static int check_sums(int n_args, char **args)
{
	if (n_args == 0) {
		return check_sum_file("-") < 0 ? 1 : 0;
	}

	int ret = 0;

	/* Keep going after a failure so every list is still checked. */
	for (int i = 0; i < n_args; ++i) {
		ret |= check_sum_file(args[i]) < 0 ? 1 : 0;
	}

	return ret;
}

/*
 * Print the help text and hand back r unchanged so callers can write
 * "return usage(code);".
 *
 * The text goes to stdout when r is 0 and to stderr for any other value.
 */
static int usage(int r)
{
	static const char help_text[] =
"Usage: b3sum [options] [file]...\n"
"\n"
"Summary:\n"
"    This is a limited C implementation of b3sum which is otherwise implemented\n"
"    in Rust.\n"
"\n"
"Options:\n"
"    -c, --check    Read BLAKE3 sums from the [FILE]s and check them\n"
"    -h, --help     Get help\n";

	FILE *dest;
	if (r == 0)
		dest = stdout;
	else
		dest = stderr;

	fputs(help_text, dest);
	return r;
}

/*
 * Entry point: parse options, then either print checksums for the
 * positional arguments or, with -c/--check, verify checksum lists.
 *
 * -h/--help prints the usage text and exits with status 0. An unrecognised
 * option prints the usage text to stderr and exits with status 1 instead of
 * being ignored. Otherwise the exit status is whatever check_sums() or
 * generate_sums() returns.
 */
int main(int argc, char *argv[])
{
	static const char short_opts[] = "hc";

	static const struct option long_opts[] = {
		{ "help", no_argument, NULL, 'h' },
		{ "check", no_argument, NULL, 'c' },
		{ NULL, 0, NULL, 0 }
	};

	bool check = false;

	for (;;) {
		int c = getopt_long(argc, argv, short_opts, long_opts, NULL);
		if (c == -1)
			break;

		switch (c) {
		case 'h':
			return usage(0);
		case 'c':
			check = true;
			break;
		default:
			/* Unknown option: do not carry on as if it had not been given. */
			return usage(1);
		}
	}

	/* Everything after the parsed options is a positional argument. */
	int n_args = argc - optind;
	char **pos_args = argv + optind;

	if (check) {
		return check_sums(n_args, pos_args);
	}

	return generate_sums(n_args, pos_args);
}

This implements a simple version of oneapi::tbb::parallel_invoke using
plain pthreads.

The pthread implementation is slightly less performant, but it may be
better suited for light-weight embedded systems than TBB and C++.

Before bifurcating execution, a thread is allocated from a list of free
threads or it is created if the total number of threads is less than the
number of cores on the system.

If there is no thread available, both the left and right subtrees are
processed sequentially by the current thread.

If a thread could be allocated, the thread is given an execution context
for the left subtree and instructed to start execution via a barrier.

The current thread continues to process the right subtree and
synchronises again with the allocated thread via a barrier and returns
it to the list of free threads.
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

None yet

Projects

None yet

Development

Successfully merging this pull request may close these issues.

1 participant