From b250ef67bea084c214f5c61f06eec9b9422e3d44 Mon Sep 17 00:00:00 2001
From: Bill <sunyinqi0508@gmail.com>
Date: Wed, 2 Nov 2022 05:00:09 +0800
Subject: [PATCH] initial commit

---
 README.md                |  24 +++++
 bench.py                 |  63 ++++++++++++
 gcc12.2.0deadlockbug.cpp |  26 +++++
 test.cpp                 | 204 +++++++++++++++++++++++++++++++++++++++
 4 files changed, 317 insertions(+)
 create mode 100644 README.md
 create mode 100644 bench.py
 create mode 100644 gcc12.2.0deadlockbug.cpp
 create mode 100644 test.cpp
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..9b4f929
--- /dev/null
+++ b/README.md
@@ -0,0 +1,24 @@
+# Inter-thread Communication Benchmark
+
+This code benchmarks several ways of handing work back and forth between two threads, to find the fastest one.
+
+Platforms tested:
+- Windows (MSVC, gcc-12.2, WSL-gcc11.2, WSL-clang-14)
+- macOS (clang-13, gcc-12.2)
+- Linux (gcc-12.1, gcc-11.3, clang-12)
+- arm64 Linux (gcc-12.1, clang-12)
+
+Methods tested:
+- C++20 semaphores (`std::binary_semaphore`)
+- Native semaphores
+- Condition Variables
+- Busy waiting (spinlock) with or without delay
+
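+Results (cv = condition variables, busy = busy waiting, nsmph = native semaphores, smph = C++20 std semaphores, dl = deadlock):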
+	macOS	linux	win, msc	win, gcc
+cv	6000	10000	700	26000
+busy	360	300	550	300
+nsmph	600	4418	6000	1200
+smph	240	1000/dl	660	dl
+
+A bug in g++ 12.2.0 can cause a deadlock; see `gcc12.2.0deadlockbug.cpp` for a minimal reproduction.
diff --git a/bench.py b/bench.py
new file mode 100644
index 0000000..02caffc
--- /dev/null
+++ b/bench.py
@@ -0,0 +1,63 @@
+import ctypes
+import threading
+import os
+# os.add_dll_directory('c:/mingw64/bin')
+
+test = ctypes.CDLL('./a.so')  # native library exporting the entry points used below (presumably built from test.cpp)
+N = 100000  # number of hand-offs performed by each benchmark
+print(0)  # benchmark 0: loop_acquire / acquire
+h = threading.Thread(target=test['loop_acquire'], args = (N,), daemon = True)  # loop_* side runs in a helper thread
+lock = test['acquire']
+h.start()
+for _ in range(N):  # this thread is the partner side: one native call per hand-off
+	lock()
+h.join()
+
+print(1)  # benchmark 1: loop_aacquire / aacquire
+h = threading.Thread(target=test['loop_aacquire'], args = (N,), daemon = True)
+lock = test['aacquire']
+h.start()
+for _ in range(N):
+	lock()
+h.join()
+
+print(2)  # benchmark 2: loop_lock / lock
+h = threading.Thread(target=test['loop_lock'], args = (N,), daemon = True)
+lock = test['lock']
+h.start()
+for _ in range(N):
+	lock()
+h.join()
+
+print(3)  # benchmark 3: loop_flag / set
+h = threading.Thread(target=test['loop_flag'], args = (N,), daemon = True)
+set = test['set']  # NOTE: shadows the Python builtin `set` for the rest of the script
+h.start()
+for _ in range(N):
+	set()
+h.join()
+
+print(4)  # benchmark 4: loop_slp / set, with 0 passed as loop_slp's second argument
+h = threading.Thread(target=test['loop_slp'], args = (N, 0), daemon = True)
+set = test['set']
+h.start()
+for _ in range(N):
+	set()
+h.join()
+
+#print(5)
+#h = threading.Thread(target=test['loop_slp'], args = (N, 1), daemon = True)
+#set = test['set']
+#h.start()
+#for _ in range(N):
+#	set()
+#h.join()
+
+print(6)  # benchmark 6: loop_atomic / atomic_set
+h = threading.Thread(target=test['loop_atomic'], args = (N, ), daemon = True)
+set = test['atomic_set']
+h.start()
+for _ in range(N):
+	set()
+h.join()
+
diff --git a/gcc12.2.0deadlockbug.cpp b/gcc12.2.0deadlockbug.cpp
new file mode 100644
index 0000000..0b62157
--- /dev/null
+++ b/gcc12.2.0deadlockbug.cpp
@@ -0,0 +1,26 @@
+#include <thread>
+#include <semaphore>
+constexpr int loop = 100000;
+std::binary_semaphore a{0},  b{1};
+
+void producer() {  // repeats `loop` times: wait on a, then signal b
+  for(int i = 0; i < loop; ++i) {
+    a.acquire();  // a starts at 0, so this blocks until consumer() releases it
+    b.release();  // let consumer() start its next iteration
+  }
+}
+
+void consumer() {  // repeats `loop` times: wait on b, then signal a
+  for(int i = 0; i < loop; ++i) {
+    b.acquire();  // b starts at 1, so the first iteration does not block
+    a.release();  // wake producer()
+  }
+}
+
+int main() {  // runs producer() and consumer() on two threads and waits for both
+  std::thread t1(producer);
+  std::thread t2(consumer);
+  t1.join();
+  t2.join();
+  puts("done");  // NOTE(review): puts is declared in <cstdio>, which this file does not include directly
+}
diff --git a/test.cpp b/test.cpp
new file mode 100644
index 0000000..82970f4
--- /dev/null
+++ b/test.cpp
@@ -0,0 +1,204 @@
+
+#ifdef _MSC_VER
+#define EXPORT _declspec(dllexport)
+#include <Windows.h>
+class A_Semaphore {  // wrapper around a Win32 semaphore object with a maximum count of 1
+private:
+	HANDLE native_handle;
+public:
+	A_Semaphore(bool v = false) {  // v: initial count (false -> 0, true -> 1)
+		native_handle = CreateSemaphore(NULL, v, 1, NULL);  // args: default security, initial count, max count, unnamed
+	}
+	void acquire() {
+		WaitForSingleObject(native_handle, INFINITE);  // wait with no timeout
+	}
+	void release() {
+		ReleaseSemaphore(native_handle, 1, NULL);  // raise the count by 1; the previous count is not requested
+	}
+	~A_Semaphore() {
+		CloseHandle(native_handle);
+	}
+};
+#else 
+#define EXPORT
+#ifdef __APPLE__
+#include <dispatch/dispatch.h>
+class A_Semaphore {  // wrapper around a libdispatch semaphore
+private:
+	dispatch_semaphore_t native_handle;
+public:
+	A_Semaphore(bool v = false) {  // v: initial value (false -> 0, true -> 1)
+		native_handle = dispatch_semaphore_create(v);
+	}
+	void acquire() {
+		dispatch_semaphore_wait(native_handle, DISPATCH_TIME_FOREVER);  // pass the handle itself, not its address
+	}
+	void release() {
+		dispatch_semaphore_signal(native_handle);  // pass the handle itself, not its address
+	}
+	~A_Semaphore() {  // NOTE(review): the dispatch object is not released here
+	}
+};
+#else
+#include <semaphore.h>
+class A_Semaphore {  // wrapper around an unnamed POSIX semaphore
+private:
+	sem_t native_handle;
+public:
+	A_Semaphore(bool v = false) {  // v: initial value (false -> 0, true -> 1)
+		sem_init(&native_handle, 0, v);  // args: semaphore, pshared (0 = shared between threads only), initial value
+	}
+	void acquire() {
+		sem_wait(&native_handle);  // block until the value is positive, then decrement it
+	}
+	void release() {
+		sem_post(&native_handle);  // increment the value, waking a waiter if there is one
+	}
+	~A_Semaphore() {
+		sem_destroy(&native_handle);
+	}
+};
+#endif
+#endif
+A_Semaphore pp{ 0 }, cc{ 1 };
+#include <mutex>
+#include <chrono>
+#include <thread>
+#include <condition_variable>
+#include <semaphore>
+
+using namespace std;
+using namespace std::chrono_literals;
+mutex m;
+condition_variable cv;
+binary_semaphore producer{ 0 }, consumer{ 1 };
+chrono::high_resolution_clock::time_point now;
+int idx;
+bool ready = false;
+
+extern "C" EXPORT void acquire() {  // one hand-off on the std::binary_semaphore pair; partner of loop_acquire()
+	consumer.acquire();  // wait for our turn (consumer starts at 1, so the first call does not block)
+	// work
+	producer.release();  // let loop_acquire() run one iteration
+}
+
+extern "C" EXPORT void loop_acquire(int n) {  // timed side of the std::binary_semaphore benchmark; n = number of hand-offs
+	int i = n;
+	chrono::nanoseconds sum = 0ns;
+	unsigned long long k = 0;  // never modified; printed as "sum"
+	now = chrono::high_resolution_clock::now();
+	while (i-- > 0) {
+		producer.acquire();  // wait for acquire() on the other thread
+		// work
+		// printf("%d ", i);
+		consumer.release();  // allow the next acquire() call to proceed
+	}
+	sum += chrono::high_resolution_clock::now() - now;  // elapsed time for all n hand-offs
+	printf("std::semaphore: %lld sum: %llu\n", static_cast<long long>(sum.count()), k);  // cast: count() need not be long long
+}
+
+
+extern "C" EXPORT void aacquire() {  // one hand-off on the A_Semaphore pair; partner of loop_aacquire()
+	cc.acquire();  // wait for our turn
+	// work
+	pp.release();  // let loop_aacquire() run one iteration
+}
+
+extern "C" EXPORT void loop_aacquire(int n) {  // timed side of the A_Semaphore benchmark; n = number of hand-offs
+	int i = n;
+	chrono::nanoseconds sum = 0ns;
+	unsigned long long k = 0;  // never modified; printed as "sum"
+	now = chrono::high_resolution_clock::now();
+	while (i-- > 0) {
+		pp.acquire();  // wait for aacquire() on the other thread
+		// work
+		// printf("%d ", i);
+		cc.release();  // allow the next aacquire() call to proceed
+	}
+	sum += chrono::high_resolution_clock::now() - now;  // elapsed time for all n hand-offs
+	printf("native semaphore: %lld sum: %llu\n", static_cast<long long>(sum.count()), k);  // cast: count() need not be long long
+}
+
+extern "C" EXPORT void lock(){
+	// Consume one `ready` signal published by loop_lock(), then notify it.
+	unique_lock<mutex> lk(m);
+	cv.wait(lk, [] { return ready; });  // sleep until ready == true
+	ready = false;
+	lk.unlock();  // drop the mutex before notifying
+	cv.notify_one();
+}
+
+extern "C" EXPORT void loop_lock(int n){  // timed side of the condition-variable benchmark; n = number of hand-offs
+	int i = n;
+	chrono::nanoseconds sum = 0ns;
+	unsigned long long k = 0;  // never modified; printed as "sum"
+	now = chrono::high_resolution_clock::now();
+	while(i-- > 0){
+		unique_lock<mutex> lk(m);
+		ready = true;  // publish one signal for lock()
+		lk.unlock();
+		cv.notify_one();
+		lk.lock();
+		cv.wait(lk, [] {return !ready; });  // wait until lock() has cleared ready
+	}
+	sum += chrono::high_resolution_clock::now() - now;  // elapsed time for all n hand-offs
+	printf("lock: %lld sum: %llu\n", static_cast<long long>(sum.count()), k);  // cast: count() need not be long long
+}
+
+volatile bool flag = 0;  // used by set(), loop_flag() and loop_slp(); NOTE(review): volatile is not a thread-synchronization primitive in C++
+extern "C" EXPORT void set(){  // busy-wait until flag is set, then clear it
+	while (!flag); flag = false;
+}
+
+
+extern "C" EXPORT void loop_flag(int n){  // timed side of the volatile-flag busy-wait benchmark; n = number of hand-offs
+	int i = n;
+	unsigned s = 0;  // never modified; printed as "s"
+	chrono::nanoseconds sum = 0ns;
+	now = chrono::high_resolution_clock::now();
+	while(i > 0){
+		if (!flag) {  // flag is clear: raise it again
+			flag = true;
+			--i;  // each time the flag is raised counts as one hand-off
+		}
+	}
+	sum += chrono::high_resolution_clock::now() - now;  // elapsed time for all n hand-offs
+	printf("flag: %lld s: %u\n", static_cast<long long>(sum.count()), s);  // cast: count() need not be long long
+}
+
+#include <atomic>
+std::atomic<bool> af{false};  // flag passed between atomic_set() and loop_atomic()
+extern "C" EXPORT void atomic_set(){  // busy-wait until af is set, then clear it
+	while (!af.load()) {} af.store(false);
+}
+
+
+extern "C" EXPORT void loop_atomic(int n){  // timed side of the std::atomic<bool> busy-wait benchmark; n = number of hand-offs
+	int i = n;
+	chrono::nanoseconds sum = 0ns;
+	now = chrono::high_resolution_clock::now();
+	while(i > 0){
+		if (!af) {  // af is clear: raise it again
+			af = true;
+			--i;  // each time the flag is raised counts as one hand-off
+		}
+	}
+	sum += chrono::high_resolution_clock::now() - now;  // elapsed time for all n hand-offs
+	printf("atomic: %lld\n", static_cast<long long>(sum.count()));  // own label so it is not confused with loop_flag's "flag:" line
+}
+
+
+extern "C" EXPORT void loop_slp(int n, int slp){  // like loop_flag(), but sleeps slp nanoseconds after every poll; n = number of hand-offs
+	int i = n;
+	chrono::nanoseconds sum = 0ns;
+	now = chrono::high_resolution_clock::now();
+	while(i > 0){
+		if (!flag) {  // flag is clear: raise it again
+			flag = true;
+			--i;  // each time the flag is raised counts as one hand-off
+		}
+		std::this_thread::sleep_for(std::chrono::nanoseconds(slp));  // runs on every poll, whether or not the flag was raised
+	}
+	sum += chrono::high_resolution_clock::now() - now;  // elapsed time for all n hand-offs, sleeps included
+	printf("flag: %lld slp: %d\n", static_cast<long long>(sum.count()), slp);  // cast: count() need not be long long
+}