Ok, I like the looks of this approximation:
(from https://stackoverflow.com/questions/4930307/fastest-way-to-get-the-integer-part-of-sqrtn/63457507#63457507 and https://stackoverflow.com/questions/34187171/fast-integer-square-root-approximation)
#include <algorithm>
#include <array>
#include <iostream>
#include <string>
#include <vector>
#include <math.h>
#include <time.h>
unsigned char bit_width(unsigned long long x) {
return x == 0 ? 1 : 64 - __builtin_clzll(x);
}
unsigned sqrt_lo(const unsigned n) noexcept
{
unsigned log2floor = bit_width(n) - 1;
return (unsigned) (n != 0) << (log2floor >> 1);
}
unsigned sqrt_newton(const unsigned n, unsigned int P)
{
unsigned a = sqrt_lo(n);
unsigned b = n;
// compute unsigned difference
while (std::max(a, b) - std::min(a, b) > P*P) { //P*P reduces iterations of while loop when using higher P.
//printf("iteration\n");
b = n / a;
a = (a + b) / 2;
}
// a is now either floor(sqrt(n)) or ceil(sqrt(n))
// we decrement in the latter case
// this is overflow-safe as long as we start with a lower bound guess
return a - (a * a > n); //could remove - (a * a > n); if using low precision.
}
int main()
{
unsigned Nums[29] = {27,28,29,30,31,32,33,34,35,36,37,38,39,40,44,45,46,47,48,58,59,60,61,62,98,99,100,101,102};
int P = 5;
clock_t tStart = clock();
printf("Range value\tEstimate\tExact\n");
for (int i=0;i<29;i++){
unsigned test = Nums[i]*Nums[i];
printf("%i\t",Nums[i]);
printf("%i\t",sqrt_newton(test,P)/P);
printf("%i\n",(int)floor(sqrt(test))/P);
}
//printf("Estimate: %i\t",sqrt_newton(test)/P);
//printf("Exact: %i\n",(int)floor(sqrt(test))/P);
printf("Time: %.9fs\n", (double)(clock() - tStart)/CLOCKS_PER_SEC);
}
results:
P = 1
P = 3:
P = 5
P = 10:
The only issue I found is that with 5 >= P >= 9, there are a couple points where the estimate is higher than estimates of larger ranges. (see P = 5)
I could just go ahead and use this and see what happens. (id have to figure out the windows build process first).