@@ -804,9 +804,9 @@ __kernel void init_vm(__global const void* entropy_data, __global void* vm_state
804804 int32_t value = get_byte (registerReadCycle , dst );
805805 update_max (value , slot_to_use / WORKERS_PER_HASH );
806806 set_byte (registerReadCycle , dst , value );
807- ScratchpadLatency = slot_to_use / WORKERS_PER_HASH ;
807+ ScratchpadLatency = ( slot_to_use / WORKERS_PER_HASH ) + 1 ;
808808 if ((mod >> 4 ) >= StoreL3Condition )
809- ScratchpadHighLatency = slot_to_use / WORKERS_PER_HASH ;
809+ ScratchpadHighLatency = ( slot_to_use / WORKERS_PER_HASH ) + 1 ;
810810 }
811811 }
812812
@@ -1598,27 +1598,12 @@ double fma_soft(double a, double b, double c, uint32_t rounding_mode)
15981598
15991599double div_rnd (double a , double b , uint32_t fprc )
16001600{
1601- // Initial approximation
1602- double y0 ;
1603- #ifdef __NV_CL_C_VERSION
1604- asm("rcp.approx.ftz.f64 %0, %1;" : "=d" (y0 ) : "d" (b ));
1605- #else
1606- y0 = native_recip (b );
1607- #endif
1608-
1609- // Improve initial approximation (can be skipped)
1610- // 1 of 2^31 quotients will be incorrect in the last bit without it (1 incorrect hash per ~32768 hashes)
1611- #ifdef HIGH_PRECISION
1612- y0 = fma (y0 , fma (- b , y0 , 1.0 ), y0 );
1613- #endif
1601+ double y0 = 1.0 / b ;
16141602
1615- // First Newton-Raphson iteration
1616- const double y1 = fma (y0 , fma (- b , y0 , 1.0 ), y0 );
1617- const double t0 = a * y1 ;
1603+ // Do 1 Newton-Raphson iteration to get correct rounding
1604+ const double t0 = a * y0 ;
16181605 const double t1 = fma (- b , t0 , a );
1619-
1620- // Second Newton-Raphson iteration
1621- double result = fma_soft (y1 , t1 , t0 , fprc );
1606+ double result = fma_soft (y0 , t1 , t0 , fprc );
16221607
16231608 // Check for infinity/NaN
16241609 const uint64_t inf = 2047UL << 52 ;
@@ -1627,31 +1612,16 @@ double div_rnd(double a, double b, uint32_t fprc)
16271612 if (((as_ulong (result ) >> 52 ) & 2047 ) == 2047 ) result = as_double (inf_rnd );
16281613 if (as_ulong (a ) == inf ) result = a ;
16291614
1630- // Check for exact equality
1631- if (a == b ) result = 1.0 ;
1632-
1633- return result ;
1615+ return (a == b ) ? 1.0 : result ;
16341616}
16351617
16361618double sqrt_rnd (double x , uint32_t fprc )
16371619{
1638- // Initial approximation
1639- double y0 , t0 , t1 ;
1640- #ifdef __NV_CL_C_VERSION
1641- asm("rsqrt.approx.ftz.f64 %0, %1;" : "=d" (y0 ) : "d" (x ));
1642- #else
1643- y0 = native_rsqrt (x );
1644- #endif
1645-
1646- // Improve initial approximation (can be skipped)
1647- // 1 of 2^28 square roots will be incorrect in the last bit without it (1 incorrect hash per ~2731 hashes)
1648- #ifdef HIGH_PRECISION
1649- y0 = fma (y0 , fma (y0 * -0.5 , y0 * x , 0.5 ), y0 );
1650- #endif
1620+ double y0 = rsqrt (x );
16511621
16521622 // First Newton-Raphson iteration
1653- t0 = y0 * x ;
1654- t1 = y0 * -0.5 ;
1623+ double t0 = y0 * x ;
1624+ double t1 = y0 * -0.5 ;
16551625 t1 = fma (t1 , t0 , 0.5 ); // 0.5 * (1.0 - y0 * y0 * x)
16561626 const double y1_x = fma (t0 , t1 , t0 ); // y1 * x = 0.5 * y0 * x * (3.0 - y0 * y0 * x)
16571627
0 commit comments