brichard19 · ByLamacq · Mar 14, 2020 · Mar 18, 2020 · Mar 18, 2020 · Apr 21, 2021
diff --git a/BitCrack.props b/BitCrack.props
@@ -2,10 +2,10 @@
 <Project ToolsVersion="4.0" xmlns="http://schemas.microsoft.com/developer/msbuild/2003">
   <ImportGroup Label="PropertySheets" />
   <PropertyGroup Label="UserMacros">
-	<CUDA_INCLUDE>C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.1\include</CUDA_INCLUDE>
-	<CUDA_LIB>C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.1\lib\x64</CUDA_LIB>
-    <OPENCL_INCLUDE>C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.1\include</OPENCL_INCLUDE>
-    <OPENCL_LIB>C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v10.1\lib\x64</OPENCL_LIB>
+	<CUDA_INCLUDE>C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.3\include</CUDA_INCLUDE>
+	<CUDA_LIB>C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.3\lib\x64</CUDA_LIB>
+    <OPENCL_INCLUDE>C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.3\include</OPENCL_INCLUDE>
+    <OPENCL_LIB>C:\Program Files\NVIDIA GPU Computing Toolkit\CUDA\v11.3\lib\x64</OPENCL_LIB>
   </PropertyGroup>
   <PropertyGroup />
   <ItemDefinitionGroup />

diff --git a/CudaKeySearchDevice/CudaDeviceKeys.cu b/CudaKeySearchDevice/CudaDeviceKeys.cu
@@ -22,8 +22,22 @@ __device__ unsigned int *ec::getYPtr()
 	return _yPtr[0];
 }
 
-__global__ void multiplyStepKernel(const unsigned int *privateKeys, int pointsPerThread, int step, unsigned int *chain, const unsigned int *gxPtr, const unsigned int *gyPtr);
+__global__ void multiplyStepKernel(unsigned int *privateKeys, int pointsPerThread, int step, unsigned int *chain, const unsigned int *gxPtr, const unsigned int *gyPtr);
 
+unsigned int * CudaDeviceKeys::getXPoints()
+{
+	return _devX;
+}
+
+unsigned int * CudaDeviceKeys::getYPoints()
+{
+	return _devY;
+}
+
+unsigned int * CudaDeviceKeys::getChain()
+{
+	return _devChain;
+}
 
 int CudaDeviceKeys::getIndex(int block, int thread, int idx)
 {
@@ -45,15 +59,22 @@ void CudaDeviceKeys::splatBigInt(unsigned int *dest, int block, int thread, int
 	i.exportWords(value, 8, secp256k1::uint256::BigEndian);
 
 	int totalThreads = _blocks * _threads;
-	int threadId = block * _threads + thread;
+	int threadId = block * _threads * 4 + thread * 4;
 
 	int base = idx * _blocks * _threads * 8;
 
 	int index = base + threadId;
 
-	for(int k = 0; k < 8; k++) {
+	for(int k = 0; k < 4; k++) {
 		dest[index] = value[k];
-		index += totalThreads;
+		index++;
+	}
+
+	index = base + totalThreads * 4 + threadId;
+
+	for(int k = 4; k < 8; k++) {
+		dest[index] = value[k];
+		index++;
 	}
 }
 
@@ -62,15 +83,22 @@ secp256k1::uint256 CudaDeviceKeys::readBigInt(unsigned int *src, int block, int
 	unsigned int value[8] = { 0 };
 
 	int totalThreads = _blocks * _threads;
-	int threadId = block * _threads + thread;
+	int threadId = block * _threads * 4 + thread * 4;
 
 	int base = idx * _blocks * _threads * 8;
 
 	int index = base + threadId;
 
-	for(int k = 0; k < 8; k++) {
+	for(int k = 0; k < 4; k++) {
+		value[k] = src[index];
+		index++;
+	}
+
+	index = base + totalThreads * 4 + threadId;
+
+	for(int k = 4; k < 8; k++) {
 		value[k] = src[index];
-		index += totalThreads;
+		index++;
 	}
 
 	secp256k1::uint256 v(value, secp256k1::uint256::BigEndian);
@@ -255,9 +283,11 @@ cudaError_t CudaDeviceKeys::init(int blocks, int threads, int pointsPerThread, c
 
 void CudaDeviceKeys::clearPublicKeys()
 {
+	cudaFree(_devChain);
 	cudaFree(_devX);
 	cudaFree(_devY);
 
+	_devChain = NULL;
 	_devX = NULL;
 	_devY = NULL;
 }
@@ -267,9 +297,9 @@ void CudaDeviceKeys::clearPrivateKeys()
 	cudaFree(_devBasePointX);
 	cudaFree(_devBasePointY);
 	cudaFree(_devPrivate);
-	cudaFree(_devChain);
+	//cudaFree(_devChain);
 
-	_devChain = NULL;
+	//_devChain = NULL;
 	_devBasePointX = NULL;
 	_devBasePointY = NULL;
 	_devPrivate = NULL;
@@ -286,7 +316,7 @@ cudaError_t CudaDeviceKeys::doStep()
 	return err;
 }
 
-__global__ void multiplyStepKernel(const unsigned int *privateKeys, int pointsPerThread, int step, unsigned int *chain, const unsigned int *gxPtr, const unsigned int *gyPtr)
+__global__ void multiplyStepKernel(unsigned int *privateKeys, int pointsPerThread, int step, unsigned int *chain, const unsigned int *gxPtr, const unsigned int *gyPtr)
 {
 	unsigned int *xPtr = ec::getXPtr();
 
@@ -394,4 +424,4 @@ bool CudaDeviceKeys::selfTest(const std::vector<secp256k1::uint256> &privateKeys
 	}
 
 	return true;
-}
+}
diff --git a/CudaKeySearchDevice/CudaDeviceKeys.h b/CudaKeySearchDevice/CudaDeviceKeys.h
@@ -76,6 +76,9 @@ class CudaDeviceKeys {
 
 	void clearPublicKeys();
 
+	unsigned int * getXPoints();
+	unsigned int * getYPoints();
+	unsigned int * getChain();
 };
 
-#endif
+#endif
diff --git a/CudaKeySearchDevice/CudaKeyCheckDevice.cu b/CudaKeySearchDevice/CudaKeyCheckDevice.cu
@@ -0,0 +1,167 @@
+#include <cuda.h>
+#include <cuda_runtime.h>
+#include <device_launch_parameters.h>
+#include "KeySearchTypes.h"
+#include "CudaKeySearchDevice.h"
+#include "ptx.cuh"
+#include "secp256k1.cuh"
+
+#include "sha256.cuh"
+#include "ripemd160.cuh"
+
+#include "secp256k1.h"
+
+#include "CudaHashLookup.cuh"
+#include "CudaAtomicList.cuh"
+#include "CudaDeviceKeys.cuh"
+
+
+__device__ void doRMD160FinalRound(const unsigned int hIn[5], unsigned int hOut[5])
+{
+    const unsigned int iv[5] = {
+        0x67452301,
+        0xefcdab89,
+        0x98badcfe,
+        0x10325476,
+        0xc3d2e1f0
+    };
+
+    for(int i = 0; i < 5; i++) {
+        hOut[i] = endian(hIn[i] + iv[(i + 1) % 5]);
+    }
+}
+
+__device__ void hashPublicKey(const unsigned int *x, const unsigned int *y, unsigned int *digestOut)
+{
+    unsigned int hash[8];
+
+    sha256PublicKey(x, y, hash);
+
+    // Swap to little-endian
+    for(int i = 0; i < 8; i++) {
+        hash[i] = endian(hash[i]);
+    }
+
+    ripemd160sha256NoFinal(hash, digestOut);
+}
+
+__device__ void hashPublicKeyCompressed(const unsigned int *x, unsigned int yParity, unsigned int *digestOut)
+{
+    unsigned int hash[8];
+
+    sha256PublicKeyCompressed(x, yParity, hash);
+
+    // Swap to little-endian
+    for(int i = 0; i < 8; i++) {
+        hash[i] = endian(hash[i]);
+    }
+
+    ripemd160sha256NoFinal(hash, digestOut);
+}
+
+__device__ void setResultFound(const int numBlocks, bool compressed, unsigned int x[8], unsigned int y[8], unsigned int digest[5])
+{
+    CudaDeviceResult r;
+
+    unsigned int virtualPointsPerThread = blockIdx.x / numBlocks; // ex: (uint) 8 / 6 = 1
+    unsigned int virtualBlock = blockIdx.x % numBlocks;
+
+    r.block = virtualBlock;
+    r.thread = threadIdx.x;
+    r.idx = virtualPointsPerThread;
+    r.compressed = compressed;
+
+    for(int i = 0; i < 8; i++) {
+        r.x[i] = x[i];
+        r.y[i] = y[i];
+    }
+
+    doRMD160FinalRound(digest, r.digest);
+
+    atomicListAdd(&r, sizeof(r));
+}
+
+/**
+ * Reads an 2-vector4 big integer from device memory with virtualized loop
+ */
+__device__ static void readIntVirtualized(unsigned int *ara, const int numBlocks, unsigned int x[8])
+{
+    uint4 *araTmp = reinterpret_cast<uint4 *>(ara);
+
+    unsigned int virtualPointsPerThread = blockIdx.x / numBlocks; // ex: (uint) 8 / 6 = 1
+    unsigned int virtualBlock = blockIdx.x % numBlocks; // ex: 8 % 6 = 2
+    unsigned int totalThreads = numBlocks * blockDim.x; // 6 * 128 = 768
+    unsigned int base = virtualPointsPerThread * totalThreads * 2; // 1 * 768
+    unsigned int threadId = blockDim.x * virtualBlock + threadIdx.x; // 128 * 2 * ... 
+    unsigned int index = base + threadId;
+
+    uint4 xTmp = araTmp[index];
+    x[0] = xTmp.x;
+    x[1] = xTmp.y;
+    x[2] = xTmp.z;
+    x[3] = xTmp.w;
+
+    index += totalThreads;
+
+    xTmp = araTmp[index];
+    x[4] = xTmp.x;
+    x[5] = xTmp.y;
+    x[6] = xTmp.z;
+    x[7] = xTmp.w;
+}
+
+__device__ static unsigned int readIntLSWVirtualized(unsigned int *ara, const int numBlocks)
+{
+    uint4 *araTmp = reinterpret_cast<uint4 *>(ara);
+
+    unsigned int virtualPointsPerThread = blockIdx.x / numBlocks; // ex: (uint) 8 / 6 = 1
+    unsigned int virtualBlock = blockIdx.x % numBlocks; // ex: 8 % 6 = 2
+    unsigned int totalThreads = numBlocks * blockDim.x; // 6 * 128 = 768
+    unsigned int base = virtualPointsPerThread * totalThreads * 2; // 1 * 768
+    unsigned int threadId = blockDim.x * virtualBlock + threadIdx.x; // 128 * 2 * ... 
+    unsigned int index = base + threadId;
+
+    index += totalThreads;
+
+    uint4 xTmp = araTmp[index];
+
+    return xTmp.w;
+}
+
+__device__ void hashAndCheck(unsigned int *xPtr, unsigned int *yPtr, const int numBlocks, const int compression)
+{
+    unsigned int x[8];
+    unsigned int digest[5];
+
+    readIntVirtualized(xPtr, numBlocks, x);
+
+    if(compression == PointCompressionType::UNCOMPRESSED || compression == PointCompressionType::BOTH) {
+        unsigned int y[8];
+        readIntVirtualized(yPtr, numBlocks, y);
+
+        hashPublicKey(x, y, digest);
+
+        if(checkHash(digest)) {
+            setResultFound(numBlocks, false, x, y, digest);
+        }
+    }
+
+    if(compression == PointCompressionType::COMPRESSED || compression == PointCompressionType::BOTH) {
+        hashPublicKeyCompressed(x, readIntLSWVirtualized(yPtr, numBlocks), digest);
+
+        if(checkHash(digest)) {
+            unsigned int y[8];
+            readIntVirtualized(yPtr, numBlocks, y);
+            setResultFound(numBlocks, true, x, y, digest);
+        }
+    }
+}
+
+/**
+* Performs a single iteration
+*/
+__global__ void keyCheckKernel(unsigned int *xPtr, unsigned int *yPtr, const int blocks, const int compression)
+{
+    hashAndCheck(xPtr, yPtr, blocks, compression);
+}
+
diff --git a/CudaKeySearchDevice/CudaKeySearchDevice.cpp b/CudaKeySearchDevice/CudaKeySearchDevice.cpp
@@ -80,7 +80,7 @@ void CudaKeySearchDevice::init(const secp256k1::uint256 &start, int compression,
 
     generateStartingPoints();
 
-    cudaCall(allocateChainBuf(_threads * _blocks * _pointsPerThread));
+    //cudaCall(allocateChainBuf(_threads * _blocks * _pointsPerThread));
 
     // Set the incrementor
     secp256k1::ecpoint g = secp256k1::G();
@@ -148,9 +148,9 @@ void CudaKeySearchDevice::doStep()
 
     try {
         if(_iterations < 2 && _startExponent.cmp(numKeys) <= 0) {
-            callKeyFinderKernel(_blocks, _threads, _pointsPerThread, true, _compression);
+            callKeyFinderKernel(_blocks, _threads, _pointsPerThread, _deviceKeys.getXPoints(), _deviceKeys.getYPoints(), _deviceKeys.getChain(), true, _compression);
         } else {
-            callKeyFinderKernel(_blocks, _threads, _pointsPerThread, false, _compression);
+            callKeyFinderKernel(_blocks, _threads, _pointsPerThread, _deviceKeys.getXPoints(), _deviceKeys.getYPoints(), _deviceKeys.getChain(), false, _compression);
         }
     } catch(cuda::CudaException ex) {
         throw KeySearchException(ex.msg);
@@ -313,4 +313,4 @@ secp256k1::uint256 CudaKeySearchDevice::getNextKey()
     uint64_t totalPoints = (uint64_t)_pointsPerThread * _threads * _blocks;
 
     return _startExponent + secp256k1::uint256(totalPoints) * _iterations * _stride;
-}
+}