1
0

create optimized asin and harvensine functions using handmade trig

This commit is contained in:
Rokas Puzonas 2025-06-24 21:03:29 +03:00
parent c59b28926f
commit fe5318b4f2
4 changed files with 470 additions and 91 deletions

View File

@ -1,4 +1,4 @@
CFLAGS=-lm -g -Wall -O3 -mavx2
CFLAGS=-lm -g -Wall -O2 -mavx2 -mfma
.PHONY := gen_data main

View File

@ -1,62 +1,94 @@
#include <float.h>
#include <math.h>
#include <sys/param.h>
#include <immintrin.h>
typedef double f64;
#define EARTH_RADIUS 6372.8
struct counters {
f64 min_input;
f64 max_input;
f64 min_output;
f64 max_output;
};
#define EMPTY_COUNTERS (struct counters){ .min_input = DBL_MAX, .max_input = -DBL_MAX, .min_output = DBL_MAX, .max_output = -DBL_MAX }
struct counters cos_counters = EMPTY_COUNTERS;
struct counters sin_counters = EMPTY_COUNTERS;
struct counters asin_counters = EMPTY_COUNTERS;
struct counters sqrt_counters = EMPTY_COUNTERS;
#define LOG_COUNTERS(counters, func, a) do { \
countres.min_input = MIN(countres.min_input, a) \
} while (0)
static f64 log_counters(struct counters *counters, f64 (*calc)(f64), f64 input)
{
counters->min_input = MIN(counters->min_input, input);
counters->max_input = MAX(counters->max_input, input);
f64 output = calc(input);
counters->min_output = MIN(counters->min_output, output);
counters->max_output = MAX(counters->max_output, output);
return output;
}
static f64 sqaure_f64(f64 num)
{
return num*num;
}
static f64 my_sin(f64 num)
static f64 my_sin(f64 input)
{
return log_counters(&sin_counters, sin, num);
f64 sign = 1;
if (input < 0) {
sign *= -1;
input = -input;
}
if (input > M_PI / 2) {
input = M_PI - input;
}
f64 input_sqr = input * input;
f64 result = 0x1.883c1c5deffbep-49;
result = fma(result, input_sqr, -0x1.ae43dc9bf8ba7p-41);
result = fma(result, input_sqr, 0x1.6123ce513b09fp-33);
result = fma(result, input_sqr, -0x1.ae6454d960ac4p-26);
result = fma(result, input_sqr, 0x1.71de3a52aab96p-19);
result = fma(result, input_sqr, -0x1.a01a01a014eb6p-13);
result = fma(result, input_sqr, 0x1.11111111110c9p-7);
result = fma(result, input_sqr, -0x1.5555555555555p-3);
result = fma(result, input_sqr, 0x1p0);
result *= input;
return result * sign;
}
static f64 my_cos(f64 num)
static f64 my_cos(f64 input)
{
return log_counters(&cos_counters, cos, num);
return my_sin(input + M_PI/2);
}
static f64 my_sqrt(f64 num)
{
return log_counters(&sqrt_counters, sqrt, num);
__m128d mm_num = _mm_set_sd(num);
__m128d mm_result = _mm_sqrt_sd(mm_num, mm_num);
f64 result = _mm_cvtsd_f64(mm_result);
return result;
}
static f64 my_asin(f64 num)
static f64 my_asin(f64 input)
{
return log_counters(&asin_counters, asin, num);
f64 scalar = 1;
f64 offset = 0;
f64 one_over_sqrt_2 = 0.707106781186547461715008; // 1/sqrt(2);
if (input > one_over_sqrt_2) {
input = my_sqrt(1 - input * input);
scalar = -1;
offset = M_PI/2;
}
f64 input_sqr = input * input;
f64 result = 0x1.3070aa6a5b88ep0;
result = fma(result, input_sqr, -0x1.2ee507d6a1a5fp2);
result = fma(result, input_sqr, 0x1.1ea75d01d0debp3);
result = fma(result, input_sqr, -0x1.500975aa37fb8p3);
result = fma(result, input_sqr, 0x1.106a078008a9ap3);
result = fma(result, input_sqr, -0x1.411c99c675e12p2);
result = fma(result, input_sqr, 0x1.1debe7d3f064p1);
result = fma(result, input_sqr, -0x1.7f8907c1978c3p-1);
result = fma(result, input_sqr, 0x1.a361973086e84p-3);
result = fma(result, input_sqr, -0x1.018ae6d82506cp-5);
result = fma(result, input_sqr, 0x1.050a65cdec399p-6);
result = fma(result, input_sqr, 0x1.62b81445afbfdp-7);
result = fma(result, input_sqr, 0x1.cbb180b74d85dp-7);
result = fma(result, input_sqr, 0x1.1c3da3ac97e63p-6);
result = fma(result, input_sqr, 0x1.6e8c66fac48d5p-6);
result = fma(result, input_sqr, 0x1.f1c716a8f3363p-6);
result = fma(result, input_sqr, 0x1.6db6db7adc18bp-5);
result = fma(result, input_sqr, 0x1.3333333323ebcp-4);
result = fma(result, input_sqr, 0x1.55555555555bap-3);
result = fma(result, input_sqr, 0x1p0);
result *= input;
return fma(result, scalar, offset);
}
static f64 radians_to_degrees(f64 degrees)

View File

@ -89,6 +89,67 @@ static void free_point_pairs(struct point_pairs *pairs)
free(pairs);
}
static f64 sum_harvensines_a(struct point_pairs *pairs)
{
f64 sum = 0;
for (int i = 0; i < pairs->count; i++) {
struct point_pair *p = &pairs->pairs[i];
sum += get_harvensine_distance(p->x0, p->y0, p->x1, p->y1, EARTH_RADIUS);
}
return sum;
}
static f64 sum_harvensines_b(struct point_pairs *pairs)
{
f64 sum = 0;
for (int i = 0; i < pairs->count; i++) {
struct point_pair *p = &pairs->pairs[i];
f64 lat1 = p->y0;
f64 lat2 = p->y1;
f64 lon1 = p->x0;
f64 lon2 = p->x1;
f64 dLat = radians_to_degrees(lat2 - lat1);
f64 dLon = radians_to_degrees(lon2 - lon1);
lat1 = radians_to_degrees(lat1);
lat2 = radians_to_degrees(lat2);
f64 a = sqaure_f64(my_sin(dLat/2.0)) + my_cos(lat1)*my_cos(lat2)*sqaure_f64(my_sin(dLon/2));
sum += my_asin(my_sqrt(a));
}
sum *= 2.0*EARTH_RADIUS;
return sum;
}
static f64 sum_harvensines_c(struct point_pairs *pairs)
{
f64 sum = 0;
for (int i = 0; i < pairs->count; i++) {
struct point_pair *p = &pairs->pairs[i];
f64 lat1 = p->y0;
f64 lat2 = p->y1;
f64 lon1 = p->x0;
f64 lon2 = p->x1;
f64 dLat = radians_to_degrees(lat2 - lat1);
f64 dLon = radians_to_degrees(lon2 - lon1);
lat1 = radians_to_degrees(lat1);
lat2 = radians_to_degrees(lat2);
f64 a = sqaure_f64(my_sin(dLat/2.0)) + my_cos(lat1)*my_cos(lat2)*sqaure_f64(my_sin(dLon/2.0));
sum += my_asin(my_sqrt(a));
}
sum *= 2.0*EARTH_RADIUS;
return sum;
}
int main(int argc, char **argv)
{
rprof_init();
@ -167,23 +228,11 @@ int main(int argc, char **argv)
// Step 3. Calculate harvensine distances
RPROF_START_BYTES("Compute harvensines", sizeof(struct point_pair)*pairs->count);
f64 *harvensines = malloc(pairs->count*sizeof(f64));
for (int i = 0; i < pairs->count; i++) {
struct point_pair *p = &pairs->pairs[i];
harvensines[i] = get_harvensine_distance(p->x0, p->y0, p->x1, p->y1, EARTH_RADIUS);
}
RPROF_STOP();
RPROF_START_BYTES("Sum harvensines", sizeof(f64)*pairs->count);
f64 harvensine_sum = 0;
for (int i = 0; i < pairs->count; i++) {
harvensine_sum += harvensines[i];
}
f64 harvensine_sum = sum_harvensines_c(pairs);
RPROF_STOP();
RPROF_START_BYTES("Free struct memory", sizeof(f64)*pairs->count + sizeof(struct point_pair)*pairs->count);
free(reference_harvensines);
free(harvensines);
RPROF_STOP();
RPROF_START("Free json memory");
free_json_value(parsed);
@ -206,23 +255,5 @@ int main(int argc, char **argv)
rprof_output(rprof_cmp_by_exclusive_duration);
printf("\n------- Counters ------\n");
printf("Sin:\n");
printf("Input: %f - %f\n", sin_counters.min_input, sin_counters.max_input);
printf("Output: %f - %f\n\n", sin_counters.min_output, sin_counters.max_output);
printf("Cos:\n");
printf("Input: %f - %f\n", cos_counters.min_input, cos_counters.max_input);
printf("Output: %f - %f\n\n", cos_counters.min_output, cos_counters.max_output);
printf("Sqrt:\n");
printf("Input: %f - %f\n", sqrt_counters.min_input, sqrt_counters.max_input);
printf("Output: %f - %f\n\n", sqrt_counters.min_output, sqrt_counters.max_output);
printf("Asin:\n");
printf("Input: %f - %f\n", asin_counters.min_input, asin_counters.max_input);
printf("Output: %f - %f\n", asin_counters.min_output, asin_counters.max_output);
return 0;
}

View File

@ -2,7 +2,13 @@
#include <inttypes.h>
#include <stdio.h>
#include <immintrin.h>
#include <time.h>
#include "./harvensine_formula.c"
typedef uint8_t u8;
typedef uint16_t u16;
typedef uint32_t u32;
typedef double f64;
typedef float f32;
typedef uint64_t u64;
@ -22,12 +28,95 @@ struct math_spec {
f64 to;
};
// From: https://github.com/cmuratori/computer_enhance/blob/main/perfaware/part4/listing_0184_sine_coefficients.inl
static f64 sine_optimized_coeffs[][11] =
{
// NOTE(casey): This minimax coefficient table was donated by Demetri Spanos
{},
{},
{0x1.fc4eac57b4a27p-1, -0x1.2b704cf682899p-3},
{0x1.fff1d21fa9dedp-1, -0x1.53e2e5c7dd831p-3, 0x1.f2438d36d9dbbp-8},
{0x1.ffffe07d31fe8p-1, -0x1.554f800fc5ea1p-3, 0x1.105d44e6222ap-7, -0x1.83b9725dff6e8p-13},
{0x1.ffffffd25a681p-1, -0x1.555547ef5150bp-3, 0x1.110e7b396c557p-7, -0x1.9f6445023f795p-13, 0x1.5d38b56aee7f1p-19},
{0x1.ffffffffd17d1p-1, -0x1.55555541759fap-3, 0x1.11110b74adb14p-7, -0x1.a017a8fe15033p-13, 0x1.716ba4fe56f6ep-19, -0x1.9a0e192a4e2cbp-26},
{0x1.ffffffffffdcep-1, -0x1.5555555540b9bp-3, 0x1.111111090f0bcp-7, -0x1.a019fce979937p-13, 0x1.71dce5ace58d2p-19, -0x1.ae00fd733fe8dp-26, 0x1.52ace959bd023p-33},
{0x1.fffffffffffffp-1, -0x1.5555555555469p-3, 0x1.111111110941dp-7, -0x1.a01a0199e0eb3p-13, 0x1.71de37e62aacap-19, -0x1.ae634d22bb47cp-26, 0x1.60e59ae00e00cp-33, -0x1.9ef5d594b342p-41},
{0x1p0, -0x1.5555555555555p-3, 0x1.11111111110c9p-7, -0x1.a01a01a014eb6p-13, 0x1.71de3a52aab96p-19, -0x1.ae6454d960ac4p-26, 0x1.6123ce513b09fp-33, -0x1.ae43dc9bf8ba7p-41, 0x1.883c1c5deffbep-49},
{0x1p0, -0x1.5555555555555p-3, 0x1.11111111110dcp-7, -0x1.a01a01a016ef6p-13, 0x1.71de3a53fa85cp-19, -0x1.ae6455b871494p-26, 0x1.612421756f93fp-33, -0x1.ae671378c3d43p-41, 0x1.90277dafc8ab9p-49, -0x1.78262e1f2709cp-58},
{0x1p0, -0x1.5555555555555p-3, 0x1.11111111110dp-7, -0x1.a01a01a01559ap-13, 0x1.71de3a52ad36dp-19, -0x1.ae64549aa7ca9p-26, 0x1.612392f66fdcdp-33, -0x1.ae11556cad6c4p-41, 0x1.71744c339ad03p-49, 0x1.52947c90f8199p-55, -0x1.ff1898c107cfap-59},
};
// From: https://github.com/cmuratori/computer_enhance/blob/main/perfaware/part4/listing_0187_arcsine_coefficients.inl
static f64 arcsine_taylor_coeffs[] =
{
1.0,
0.1666666666666666666666666666666666666666666666666666666666666666666666666666667,
0.075,
0.04464285714285714285714285714285714285714285714285714285714285714285714285714286,
0.03038194444444444444444444444444444444444444444444444444444444444444444444444444,
0.02237215909090909090909090909090909090909090909090909090909090909090909090909091,
0.01735276442307692307692307692307692307692307692307692307692307692307692307692308,
0.01396484375,
0.01155180089613970588235294117647058823529411764705882352941176470588235294117647,
0.009761609529194078947368421052631578947368421052631578947368421052631578947368421,
0.008390335809616815476190476190476190476190476190476190476190476190476190476190476,
0.007312525873598845108695652173913043478260869565217391304347826086956521739130435,
0.0064472103118896484375,
0.005740037670841923466435185185185185185185185185185185185185185185185185185185185,
0.005153309682319904195851293103448275862068965517241379310344827586206896551724138,
0.004660143486915096159904233870967741935483870967741935483870967741935483870967742,
0.004240907093679363077337091619318181818181818181818181818181818181818181818181818,
0.003880964558837669236319405691964285714285714285714285714285714285714285714285714,
0.003569205393825934545413867847339527027027027027027027027027027027027027027027027,
0.003297059503473484745392432579627403846153846153846153846153846153846153846153846,
0.003057821649258030669354810947325171493902439024390243902439024390243902439024390,
0.002846178401108942167876764785411746002906976744186046511627906976744186046511628,
0.002657870638207289933537443478902180989583333333333333333333333333333333333333333,
0.002489448678246883494640759965206714386635638297872340425531914893617021276595745,
0.002338091892111975186930883827866340170101243622448979591836734693877551020408163,
0.002201473973710138205420020419885130489573759191176470588235294117647058823529412,
0.002077661032518167442778473553019312192808906987028301886792452830188679245283019,
0.001965033616277283618439303774555975740606134588068181818181818181818181818181818,
0.001862226406403127489279102104646562222848858749657346491228070175438596491228070,
};
// From: https://github.com/cmuratori/computer_enhance/blob/main/perfaware/part4/listing_0187_arcsine_coefficients.inl
static f64 arsine_minmax_coeffs[][22] =
{
// NOTE(casey): This minimax coefficient table was donated by Demetri Spanos
{},
{},
{0x1.fdfcefbdd3154p-1 , 0x1.c427597754a37p-3},
{0x1.0019e5b9a7693p0, 0x1.3b5f83d579a47p-3, 0x1.0da162d6fae3dp-3},
{0x1.fffa004bed736p-1, 0x1.5acaca323d3aep-3, 0x1.a52ade47d967dp-5, 0x1.b0931e5a07f25p-4},
{0x1.0000609783343p0, 0x1.543eb056cd449p-3, 0x1.52db50c86c17fp-4, 0x1.c36707c70d21cp-8, 0x1.8faf3815344ddp-4},
{0x1.ffffe6586d628p-1, 0x1.558b2dc0be61cp-3, 0x1.2a2202ec5cb8p-4, 0x1.f96fb970de571p-5, -0x1.ac22c3939a9a9p-6, 0x1.912219085f248p-4},
{0x1.000001c517503p0, 0x1.554b23dabce0bp-3, 0x1.35948cff7046bp-4, 0x1.391ca703d0d07p-5, 0x1.03149c11e9277p-4, -0x1.e523c15fbf438p-5, 0x1.a906b64a9bdc7p-4},
{0x1.ffffff7f5bbbcp-1, 0x1.55573c38fd397p-3, 0x1.329cc1329ab48p-4, 0x1.7f40be8459b49p-5, 0x1.ea3ebd68f4abp-7, 0x1.4b1dad0e7b7e5p-4, -0x1.912818d0401bfp-4, 0x1.d3ec796e5ec8bp-4},
{0x1.00000009557b8p0, 0x1.5554fb74b4bffp-3, 0x1.3356afe11c66p-4, 0x1.685b6636af595p-5, 0x1.2c25059387c7ep-5, -0x1.5b983df09138p-7, 0x1.db45568eb9217p-4, -0x1.2c7c4453a9b3ep-3, 0x1.0903eea6d1357p-3},
{0x1.fffffffd3e442p-1, 0x1.555565c9beb43p-3, 0x1.332b1eab3d6f2p-4, 0x1.6f3ed264eef28p-5, 0x1.cc5fc8ed87fdbp-6, 0x1.38c39ea555adep-5, -0x1.88322c8ce661fp-5, 0x1.656a2ea43451dp-3, -0x1.aed7e0dfdbd27p-3, 0x1.32de0e0b3820fp-3},
{0x1.0000000034db9p0, 0x1.5555525723f64p-3, 0x1.3334fd1dd69f5p-4, 0x1.6d4c8c3659p-5, 0x1.fe5b240c320ebp-6, 0x1.0076fe3314273p-6, 0x1.b627b3be92bd4p-5, -0x1.ba657aa72abeep-4, 0x1.103aa8bb00a4ep-2, -0x1.2deb335977b56p-2, 0x1.699a7715830d2p-3},
{0x1.ffffffffeff9dp-1, 0x1.555555dff5e06p-3, 0x1.3332d0221f548p-4, 0x1.6dd27e8c33d52p-5, 0x1.edd05e3dff008p-6, 0x1.992d0b8b03f01p-6, -0x1.ac779f1be0507p-13, 0x1.73bb5b359003ap-4, -0x1.a8326f2354f8ap-3, 0x1.9e1b8885f9661p-2, -0x1.a1b0aa236a282p-2, 0x1.b038b25a40e08p-3},
{0x1.00000000013ap0, 0x1.5555553c5c8a7p-3, 0x1.33334839e1acap-4, 0x1.6dafeb7453ee6p-5, 0x1.f2f65baf85a8cp-6, 0x1.5f396c79d5687p-6, 0x1.9a8031b47fd85p-6, -0x1.cbd84d319158p-6, 0x1.53df7e2c17602p-3, -0x1.7a954b7cb46e6p-2, 0x1.38e97b1392a69p-1, -0x1.1eabdc3fe561ap-1, 0x1.056424720e768p-2},
{0x1.ffffffffff9fp-1, 0x1.55555559d0d4p-3, 0x1.33332ecf01c13p-4, 0x1.6db88c4cfe8eap-5, 0x1.f17068ec7ac68p-6, 0x1.73b9408ccb9b1p-6, 0x1.d2a82629eb78ep-7, 0x1.1b4dda11bb1d2p-5, -0x1.5210c527bd7ep-4, 0x1.3a638b5965e45p-2, -0x1.4434b98838c1dp-1, 0x1.d52ccc09ba2cdp-1, -0x1.8792b45ef365ep-1, 0x1.3f5545e9e11eap-2},
{0x1.0000000000079p0, 0x1.5555555487dd3p-3, 0x1.3333341adb0b8p-4, 0x1.6db67483a8f77p-5, 0x1.f1defdcf41a11p-6, 0x1.6ce213041c326p-6, 0x1.2f8bd23b33763p-6, 0x1.34a6d9f27428dp-8, 0x1.007f36ef69d66p-4, -0x1.850e0d65729e1p-3, 0x1.1f42350f23ccep-1, -0x1.0e0b5512f8d35p0, 0x1.5d065bf34c03ep0, -0x1.0a98c5604a5c6p0, 0x1.8978c6502660ap-2},
{0x1.fffffffffffdap-1, 0x1.555555557a085p-3, 0x1.33333304070d3p-4, 0x1.6db6f35f4ac13p-5, 0x1.f1c0bdf8248f6p-6, 0x1.6f0e61397193p-6, 0x1.15740f26a5e24p-6, 0x1.24069344266aap-6, -0x1.c02ef74c5e655p-7, 0x1.07833aeac1562p-3, -0x1.97487ee8ceb5p-2, 0x1.0178f7f5c01bdp0, -0x1.b8b2ea879a2a5p0, 0x1.01ccfbe6e1f6ap1, -0x1.6a46d11c16386p0, 0x1.e86a774524862p-2},
{0x1.0000000000003p0, 0x1.555555554ecb4p-3, 0x1.3333333cb4a27p-4, 0x1.6db6d5f669d29p-5, 0x1.f1c8c3485860bp-6, 0x1.6e64f7828f426p-6, 0x1.1ea1dc340da9p-6, 0x1.98123c756ff58p-7, 0x1.7a0c83f514b22p-6, -0x1.bd6eb7cdaf8e4p-5, 0x1.162743c14bf13p-2, -0x1.944c737b04ef5p-1, 0x1.c47bd23ee68a2p0, -0x1.61d1590acbbfp1, 0x1.7a6e0b194804dp1, -0x1.eba481b8f24dfp0, 0x1.311805d4c6d33p-1},
{0x1.fffffffffffffp-1, 0x1.555555555683fp-3, 0x1.3333333148aa7p-4, 0x1.6db6dca9f82d4p-5, 0x1.f1c6b0ea300d7p-6, 0x1.6e96be6dbe49ep-6, 0x1.1b8cc838ee86ep-6, 0x1.dc086c5d99cdcp-7, 0x1.b1b8d27cd7e72p-8, 0x1.5565a3d3908b9p-5, -0x1.2ab04ba9012e3p-3, 0x1.224c4dbe13cbdp-1, -0x1.83633c76e4551p0, 0x1.86bbff2a6c7b6p1, -0x1.188f223fe5f34p2, 0x1.14672d35db97ep2, -0x1.4d84801ff1aa1p1, 0x1.7f820d52c2775p-1},
{0x1p0, 0x1.555555555531ep-3, 0x1.3333333380df2p-4, 0x1.6db6db3184756p-5, 0x1.f1c73443a02f5p-6, 0x1.6e88ce94d1149p-6, 0x1.1c875d6c5323dp-6, 0x1.c37061f4e5f55p-7, 0x1.b8a33b8e380efp-7, -0x1.21438ccc95d62p-8, 0x1.69b370aad086ep-4, -0x1.57380bcd2890ep-2, 0x1.1fb54da575b22p0, -0x1.6067d334b4792p1, 0x1.4537ddde2d76dp2, -0x1.b06f523e74f33p2, 0x1.8bf4dadaf548cp2, -0x1.bec6daf74ed61p1, 0x1.dfc53682725cap-1},
{0x1p0, 0x1.55555555555bap-3, 0x1.3333333323ebcp-4, 0x1.6db6db7adc18bp-5, 0x1.f1c716a8f3363p-6, 0x1.6e8c66fac48d5p-6, 0x1.1c3da3ac97e63p-6, 0x1.cbb180b74d85dp-7, 0x1.62b81445afbfdp-7, 0x1.050a65cdec399p-6, -0x1.018ae6d82506cp-5, 0x1.a361973086e84p-3, -0x1.7f8907c1978c3p-1, 0x1.1debe7d3f064p1, -0x1.411c99c675e12p2, 0x1.106a078008a9ap3, -0x1.500975aa37fb8p3, 0x1.1ea75d01d0debp3, -0x1.2ee507d6a1a5fp2, 0x1.3070aa6a5b88ep0},
{0x1p0, 0x1.5555555555544p-3, 0x1.3333333336209p-4, 0x1.6db6db6aeb726p-5, 0x1.f1c71dcf049c4p-6, 0x1.6e8b6f8df785cp-6, 0x1.1c53c3234c54p-6, 0x1.c8eb3e8133ceap-7, 0x1.8335ee4136147p-7, 0x1.d9a5ff05f747ep-8, 0x1.b949ad43fb2bdp-6, -0x1.9080df821c302p-4, 0x1.e245cd46c886cp-2, -0x1.99434e2a3223ap0, 0x1.147d4d3b7ec76p2, -0x1.1e2a8ce097204p3, 0x1.c17aa6abf54eap3, -0x1.02778b2d86e57p4, 0x1.9ccd7e4c0706bp3, -0x1.9a13424bd53c2p2, 0x1.837ec3890fee1p0},
{0x1p0, 0x1.5555555555558p-3, 0x1.3333333332aedp-4, 0x1.6db6db6e45234p-5, 0x1.f1c71c24301p-6, 0x1.6e8baf9ddc763p-6, 0x1.1c4d64d353371p-6, 0x1.c9cf1f8de89e6p-7, 0x1.778d723247697p-7, 0x1.5fcac651d07d4p-7, 0x1.799c2f33c0274p-12, 0x1.e288894a8bc33p-5, -0x1.0446ef7fdb149p-2, 0x1.0ba0fa7048fb2p0, -0x1.a273e0e74ee85p1, 0x1.034f776a3db58p3, -0x1.f1adf47b08719p3, 0x1.6c271c319b92ap4, -0x1.886f83ada1ccfp4, 0x1.26c247c3a321bp4, -0x1.146482ddd5f29p3, 0x1.ed3ada8793e41p0},
};
static f64 square(f64 num)
{
return num * num;
}
static f64 my_sin(f64 num)
static f64 my_sin_approx(f64 num)
{
f64 sign = 1;
if (num < 0) {
@ -91,7 +180,7 @@ static f64 my_cos_using_sin(f64 num)
return my_sin_quarter_fitted(num + M_PI/2);
}
static f64 my_cos(f64 num)
static f64 my_cos_approx(f64 num)
{
if (num < 0) {
num *= -1;
@ -119,11 +208,6 @@ static f64 my_sqrt2(f64 num)
return result;
}
static f64 my_asin(f64 num)
{
return num;
}
static f64 factorial(size_t n)
{
f64 result = 1;
@ -144,7 +228,7 @@ static void create_taylor_series_coeffs(f64 *coeffs, size_t coeffs_len)
}
}
static f64 sin_using_taylor(f64 input, f64 *coeffs, size_t coeffs_len)
static f64 compute_taylor_series(f64 input, f64 *coeffs, size_t coeffs_len)
{
f64 result = 0;
for (size_t i = 0; i < coeffs_len; i++) {
@ -154,7 +238,7 @@ static f64 sin_using_taylor(f64 input, f64 *coeffs, size_t coeffs_len)
return result;
}
static f64 sin_using_taylor_horner(f64 input, f64 *coeffs, size_t coeffs_len)
static f64 compute_taylor_series_horner(f64 input, f64 *coeffs, size_t coeffs_len)
{
f64 input_sqr = input * input;
@ -171,6 +255,88 @@ static f64 sin_using_taylor_horner(f64 input, f64 *coeffs, size_t coeffs_len)
return result;
}
static f64 compute_taylor_series_horner_fma(f64 input, f64 *coeffs, size_t coeffs_len)
{
f64 input_sqr = input * input;
f64 result = coeffs[coeffs_len - 1];
for (size_t j = 1; j < coeffs_len; j++) {
size_t i = coeffs_len - j - 1;
result = fma(result, input_sqr, coeffs[i]);
}
result *= input;
return result;
}
static f64 sin_best(f64 input)
{
f64 sign = 1;
if (input < 0) {
sign *= -1;
input = -input;
}
if (input > M_PI / 2) {
input = M_PI - input;
}
f64 input_sqr = input * input;
f64 result = 0x1.883c1c5deffbep-49;
result = fma(result, input_sqr, -0x1.ae43dc9bf8ba7p-41);
result = fma(result, input_sqr, 0x1.6123ce513b09fp-33);
result = fma(result, input_sqr, -0x1.ae6454d960ac4p-26);
result = fma(result, input_sqr, 0x1.71de3a52aab96p-19);
result = fma(result, input_sqr, -0x1.a01a01a014eb6p-13);
result = fma(result, input_sqr, 0x1.11111111110c9p-7);
result = fma(result, input_sqr, -0x1.5555555555555p-3);
result = fma(result, input_sqr, 0x1p0);
result *= input;
return result * sign;
}
static f64 asin_best(f64 input)
{
f64 scalar = 1;
f64 offset = 0;
f64 one_over_sqrt_2 = 1/sqrt(2);
if (input > one_over_sqrt_2) {
input = my_sqrt1(1 - input * input);
scalar = -1;
offset = M_PI/2;
}
f64 input_sqr = input * input;
f64 result = 0x1.3070aa6a5b88ep0;
result = fma(result, input_sqr, -0x1.2ee507d6a1a5fp2);
result = fma(result, input_sqr, 0x1.1ea75d01d0debp3);
result = fma(result, input_sqr, -0x1.500975aa37fb8p3);
result = fma(result, input_sqr, 0x1.106a078008a9ap3);
result = fma(result, input_sqr, -0x1.411c99c675e12p2);
result = fma(result, input_sqr, 0x1.1debe7d3f064p1);
result = fma(result, input_sqr, -0x1.7f8907c1978c3p-1);
result = fma(result, input_sqr, 0x1.a361973086e84p-3);
result = fma(result, input_sqr, -0x1.018ae6d82506cp-5);
result = fma(result, input_sqr, 0x1.050a65cdec399p-6);
result = fma(result, input_sqr, 0x1.62b81445afbfdp-7);
result = fma(result, input_sqr, 0x1.cbb180b74d85dp-7);
result = fma(result, input_sqr, 0x1.1c3da3ac97e63p-6);
result = fma(result, input_sqr, 0x1.6e8c66fac48d5p-6);
result = fma(result, input_sqr, 0x1.f1c716a8f3363p-6);
result = fma(result, input_sqr, 0x1.6db6db7adc18bp-5);
result = fma(result, input_sqr, 0x1.3333333323ebcp-4);
result = fma(result, input_sqr, 0x1.55555555555bap-3);
result = fma(result, input_sqr, 0x1p0);
result *= input;
return fma(result, scalar, offset);
}
static void tester_record_result(struct tester *tester, f64 input, f64 my_output, f64 reference_output)
{
f64 error = reference_output - my_output;
@ -186,7 +352,7 @@ static void tester_record_result(struct tester *tester, f64 input, f64 my_output
static void tester_show_result(struct tester *tester, const char *name)
{
printf("%20s max error %.16f at %f\n", name, tester->max_error, tester->max_error_at);
printf("%20s max error %.24f at %f\n", name, tester->max_error, tester->max_error_at);
}
static f64 lerp(f64 x, f64 from, f64 to)
@ -200,11 +366,13 @@ static void test_taylor_series(u64 samples, f64 *coeffs, size_t coeffs_len)
struct tester tester = { };
struct tester tester_horner = { };
struct tester tester_horner_fma = { };
for (int j = 0; j < samples; j++) {
f64 input = lerp((f64)j / samples, 0, M_PI/2);
tester_record_result(&tester, input, sin_using_taylor(input, coeffs, coeffs_len), sin(input));
tester_record_result(&tester_horner, input, sin_using_taylor_horner(input, coeffs, coeffs_len), sin(input));
tester_record_result(&tester, input, compute_taylor_series(input, coeffs, coeffs_len), sin(input));
tester_record_result(&tester_horner, input, compute_taylor_series_horner(input, coeffs, coeffs_len), sin(input));
tester_record_result(&tester_horner_fma, input, compute_taylor_series_horner_fma(input, coeffs, coeffs_len), sin(input));
}
{
@ -219,16 +387,61 @@ static void test_taylor_series(u64 samples, f64 *coeffs, size_t coeffs_len)
tester_show_result(&tester_horner, name);
}
printf(" diff %.16f\n", tester.max_error - tester_horner.max_error);
{
char name[64] = { 0 };
snprintf(name, sizeof(name), "taylor f %zu", coeffs_len);
tester_show_result(&tester_horner, name);
}
printf(" diff normal - horner %.24f\n", tester.max_error - tester_horner.max_error);
printf(" diff horner - fused %.24f\n", tester_horner.max_error - tester_horner_fma.max_error);
}
int main()
static u8 rand_u8() {
return rand() & 0xFF;
}
static u16 rand_u16() {
return rand_u8() | (rand_u8() << 8);
}
static u32 rand_u32() {
return rand_u16() | (rand_u16() << 16);
}
// https://stackoverflow.com/a/55766267
static f64 rand_f64() {
u64 r53 = ((uint64_t)(rand_u32()) << 21) ^ (rand_u8() >> 2);
return (double)r53 / 9007199254740991.0; // 2^53 - 1
}
static f64 rand_f64_range(f64 from, f64 to) {
return rand_f64() * (to - from) + from;
}
static f64 get_harvensine_distance_reference(f64 X0, f64 Y0, f64 X1, f64 Y1, f64 earth_radius)
{
f64 lat1 = Y0;
f64 lat2 = Y1;
f64 lon1 = X0;
f64 lon2 = X1;
f64 dLat = radians_to_degrees(lat2 - lat1);
f64 dLon = radians_to_degrees(lon2 - lon1);
lat1 = radians_to_degrees(lat1);
lat2 = radians_to_degrees(lat2);
f64 a = sqaure_f64(sin(dLat/2.0)) + cos(lat1)*cos(lat2)*sqaure_f64(sin(dLon/2));
f64 c = 2.0*asin(sqrt(a));
return earth_radius * c;
}
static void sin_tests()
{
uint64_t samples = 100000;
struct math_spec specs[] = {
{
.name = "sin",
.mine = my_sin,
.mine = my_sin_approx,
.reference = sin,
.from = -M_PI,
.to = M_PI
@ -254,9 +467,16 @@ int main()
.from = -M_PI,
.to = M_PI
},
{
.name = "sin_best",
.mine = sin_best,
.reference = sin,
.from = -M_PI,
.to = M_PI
},
{
.name = "cos",
.mine = my_cos,
.mine = my_cos_approx,
.reference = cos,
.from = -M_PI/2,
.to = M_PI/2
@ -274,13 +494,6 @@ int main()
.reference = sqrt,
.from = 0,
.to = 1,
},
{
.name = "asin",
.mine = my_asin,
.reference = asin,
.from = 0,
.to = 1,
}
};
@ -297,10 +510,113 @@ int main()
tester_show_result(&tester, spec.name);
}
f64 taylor_coeffs[10];
for (int i = 2; i < ARRAY_LEN(sine_optimized_coeffs); i++) {
struct tester tester = { };
f64 *coeffs = sine_optimized_coeffs[i];
size_t coeffs_len = i;
for (int j = 0; j < samples; j++) {
f64 input = lerp((f64)j / samples, 0, M_PI/2);
tester_record_result(&tester, input, compute_taylor_series_horner_fma(input, coeffs, coeffs_len), sin(input));
}
char name[64] = { 0 };
snprintf(name, sizeof(name), "optimized taylor %zu", coeffs_len);
tester_show_result(&tester, name);
}
f64 taylor_coeffs[13];
for (int i = 1; i < ARRAY_LEN(taylor_coeffs); i++) {
test_taylor_series(samples, taylor_coeffs, i);
}
}
static void arcsin_tests()
{
uint64_t samples = 100000;
f64 from = 0;
f64 to = 1/sqrt(2);
for (int i = 0; i < ARRAY_LEN(arcsine_taylor_coeffs) - 1; i++) {
f64 *coeffs = arcsine_taylor_coeffs;
size_t coeffs_len = i + 1;
struct tester tester = { };
for (int j = 0; j < samples; j++) {
f64 input = lerp((f64)j / samples, from, to);
tester_record_result(&tester, input, compute_taylor_series_horner_fma(input, coeffs, coeffs_len), asin(input));
}
char name[64] = { 0 };
snprintf(name, sizeof(name), "taylor %zu", coeffs_len);
tester_show_result(&tester, name);
}
for (int i = 2; i < ARRAY_LEN(arsine_minmax_coeffs) - 1; i++) {
f64 *coeffs = arsine_minmax_coeffs[i];
size_t coeffs_len = i + 1;
struct tester tester = { };
for (int j = 0; j < samples; j++) {
f64 input = lerp((f64)j / samples, from, to);
tester_record_result(&tester, input, compute_taylor_series_horner_fma(input, coeffs, coeffs_len), asin(input));
}
char name[64] = { 0 };
snprintf(name, sizeof(name), "minmax %zu", coeffs_len);
tester_show_result(&tester, name);
}
{
struct tester tester = { };
for (int j = 0; j < samples; j++) {
f64 input = lerp((f64)j / samples, 0, 1);
tester_record_result(&tester, input, asin_best(input), asin(input));
}
tester_show_result(&tester, "best");
}
}
static void harvensine_tests()
{
uint64_t samples = 100000;
f64 max_difference = 0;
u32 seed = time(NULL);
srand(seed);
for (int i = 0; i < samples; i++) {
f64 x0 = rand_f64_range(-180, 180);
f64 y0 = rand_f64_range(-90, 90);
f64 x1 = rand_f64_range(-180, 180);
f64 y1 = rand_f64_range(-90, 90);
f64 expected = get_harvensine_distance_reference(x0, y0, x1, y1, EARTH_RADIUS);
f64 actual = get_harvensine_distance(x0, y0, x1, y1, EARTH_RADIUS);
max_difference = MAX(max_difference, expected - actual);
}
printf("reference vs actual, max error: %.17f\n", max_difference);
}
int main()
{
// printf("---------- sin() -------------\n");
// sin_tests();
// printf("--------- asin() -------------\n");
// arcsin_tests();
// printf("%.24f\n", 1.0 / sqrt(2));
printf("--------- haversine() -------------\n");
harvensine_tests();
return 0;
}