Skip to content

Commit 83f2f53

Browse files
committed
Fixed invalid hashes with portable code
1 parent: eacef2f; commit: 83f2f53

File tree

1 file changed

+10
-40
lines changed

1 file changed

+10
-40
lines changed

RandomX_OpenCL/CL/randomx_vm.cl

Lines changed: 10 additions & 40 deletions
Original file line number | Diff line number | Diff line change
@@ -804,9 +804,9 @@ __kernel void init_vm(__global const void* entropy_data, __global void* vm_state
804804
int32_t value = get_byte(registerReadCycle, dst);
805805
update_max(value, slot_to_use / WORKERS_PER_HASH);
806806
set_byte(registerReadCycle, dst, value);
807-
ScratchpadLatency = slot_to_use / WORKERS_PER_HASH;
807+
ScratchpadLatency = (slot_to_use / WORKERS_PER_HASH) + 1;
808808
if ((mod >> 4) >= StoreL3Condition)
809-
ScratchpadHighLatency = slot_to_use / WORKERS_PER_HASH;
809+
ScratchpadHighLatency = (slot_to_use / WORKERS_PER_HASH) + 1;
810810
}
811811
}
812812

@@ -1598,27 +1598,12 @@ double fma_soft(double a, double b, double c, uint32_t rounding_mode)
15981598

15991599
double div_rnd(double a, double b, uint32_t fprc)
16001600
{
1601-
// Initial approximation
1602-
double y0;
1603-
#ifdef __NV_CL_C_VERSION
1604-
asm("rcp.approx.ftz.f64 %0, %1;" : "=d"(y0) : "d"(b));
1605-
#else
1606-
y0 = native_recip(b);
1607-
#endif
1608-
1609-
// Improve initial approximation (can be skipped)
1610-
// 1 of 2^31 quotients will be incorrect in the last bit without it (1 incorrect hash per ~32768 hashes)
1611-
#ifdef HIGH_PRECISION
1612-
y0 = fma(y0, fma(-b, y0, 1.0), y0);
1613-
#endif
1601+
double y0 = 1.0 / b;
16141602

1615-
// First Newton-Raphson iteration
1616-
const double y1 = fma(y0, fma(-b, y0, 1.0), y0);
1617-
const double t0 = a * y1;
1603+
// Do 1 Newton-Raphson iteration to get correct rounding
1604+
const double t0 = a * y0;
16181605
const double t1 = fma(-b, t0, a);
1619-
1620-
// Second Newton-Raphson iteration
1621-
double result = fma_soft(y1, t1, t0, fprc);
1606+
double result = fma_soft(y0, t1, t0, fprc);
16221607

16231608
// Check for infinity/NaN
16241609
const uint64_t inf = 2047UL << 52;
@@ -1627,31 +1612,16 @@ double div_rnd(double a, double b, uint32_t fprc)
16271612
if (((as_ulong(result) >> 52) & 2047) == 2047) result = as_double(inf_rnd);
16281613
if (as_ulong(a) == inf) result = a;
16291614

1630-
// Check for exact equality
1631-
if (a == b) result = 1.0;
1632-
1633-
return result;
1615+
return (a == b) ? 1.0 : result;
16341616
}
16351617

16361618
double sqrt_rnd(double x, uint32_t fprc)
16371619
{
1638-
// Initial approximation
1639-
double y0, t0, t1;
1640-
#ifdef __NV_CL_C_VERSION
1641-
asm("rsqrt.approx.ftz.f64 %0, %1;" : "=d"(y0) : "d"(x));
1642-
#else
1643-
y0 = native_rsqrt(x);
1644-
#endif
1645-
1646-
// Improve initial approximation (can be skipped)
1647-
// 1 of 2^28 square roots will be incorrect in the last bit without it (1 incorrect hash per ~2731 hashes)
1648-
#ifdef HIGH_PRECISION
1649-
y0 = fma(y0, fma(y0 * -0.5, y0 * x, 0.5), y0);
1650-
#endif
1620+
double y0 = rsqrt(x);
16511621

16521622
// First Newton-Raphson iteration
1653-
t0 = y0 * x;
1654-
t1 = y0 * -0.5;
1623+
double t0 = y0 * x;
1624+
double t1 = y0 * -0.5;
16551625
t1 = fma(t1, t0, 0.5); // 0.5 * (1.0 - y0 * y0 * x)
16561626
const double y1_x = fma(t0, t1, t0); // y1 * x = 0.5 * y0 * x * (3.0 - y0 * y0 * x)
16571627

0 commit comments

Comments (0)