diff --git a/Makefile b/Makefile index 13ee15a..529e89d 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -CFLAGS=-lm -g -Wall -O3 -mavx2 +CFLAGS=-lm -g -Wall -O2 -mavx2 -mfma .PHONY := gen_data main diff --git a/src/harvensine_formula.c b/src/harvensine_formula.c index 9be1630..ad00a70 100644 --- a/src/harvensine_formula.c +++ b/src/harvensine_formula.c @@ -1,62 +1,94 @@ #include #include #include +#include typedef double f64; #define EARTH_RADIUS 6372.8 -struct counters { - f64 min_input; - f64 max_input; - f64 min_output; - f64 max_output; -}; - -#define EMPTY_COUNTERS (struct counters){ .min_input = DBL_MAX, .max_input = -DBL_MAX, .min_output = DBL_MAX, .max_output = -DBL_MAX } - -struct counters cos_counters = EMPTY_COUNTERS; -struct counters sin_counters = EMPTY_COUNTERS; -struct counters asin_counters = EMPTY_COUNTERS; -struct counters sqrt_counters = EMPTY_COUNTERS; - -#define LOG_COUNTERS(counters, func, a) do { \ -countres.min_input = MIN(countres.min_input, a) \ -} while (0) - -static f64 log_counters(struct counters *counters, f64 (*calc)(f64), f64 input) -{ - counters->min_input = MIN(counters->min_input, input); - counters->max_input = MAX(counters->max_input, input); - f64 output = calc(input); - counters->min_output = MIN(counters->min_output, output); - counters->max_output = MAX(counters->max_output, output); - return output; -} - static f64 sqaure_f64(f64 num) { return num*num; } -static f64 my_sin(f64 num) +static f64 my_sin(f64 input) { - return log_counters(&sin_counters, sin, num); + f64 sign = 1; + if (input < 0) { + sign *= -1; + input = -input; + } + + if (input > M_PI / 2) { + input = M_PI - input; + } + + f64 input_sqr = input * input; + + f64 result = 0x1.883c1c5deffbep-49; + result = fma(result, input_sqr, -0x1.ae43dc9bf8ba7p-41); + result = fma(result, input_sqr, 0x1.6123ce513b09fp-33); + result = fma(result, input_sqr, -0x1.ae6454d960ac4p-26); + result = fma(result, input_sqr, 0x1.71de3a52aab96p-19); + result = fma(result, input_sqr, -0x1.a01a01a014eb6p-13); + result = fma(result, input_sqr, 0x1.11111111110c9p-7); + result = fma(result, input_sqr, -0x1.5555555555555p-3); + result = fma(result, input_sqr, 0x1p0); + result *= input; + + return result * sign; } -static f64 my_cos(f64 num) +static f64 my_cos(f64 input) { - return log_counters(&cos_counters, cos, num); + return my_sin(input + M_PI/2); } static f64 my_sqrt(f64 num) { - return log_counters(&sqrt_counters, sqrt, num); + __m128d mm_num = _mm_set_sd(num); + __m128d mm_result = _mm_sqrt_sd(mm_num, mm_num); + f64 result = _mm_cvtsd_f64(mm_result); + return result; } -static f64 my_asin(f64 num) +static f64 my_asin(f64 input) { - return log_counters(&asin_counters, asin, num); + f64 scalar = 1; + f64 offset = 0; + f64 one_over_sqrt_2 = 0.707106781186547461715008; // 1/sqrt(2); + if (input > one_over_sqrt_2) { + input = my_sqrt(1 - input * input); + scalar = -1; + offset = M_PI/2; + } + + f64 input_sqr = input * input; + + f64 result = 0x1.3070aa6a5b88ep0; + result = fma(result, input_sqr, -0x1.2ee507d6a1a5fp2); + result = fma(result, input_sqr, 0x1.1ea75d01d0debp3); + result = fma(result, input_sqr, -0x1.500975aa37fb8p3); + result = fma(result, input_sqr, 0x1.106a078008a9ap3); + result = fma(result, input_sqr, -0x1.411c99c675e12p2); + result = fma(result, input_sqr, 0x1.1debe7d3f064p1); + result = fma(result, input_sqr, -0x1.7f8907c1978c3p-1); + result = fma(result, input_sqr, 0x1.a361973086e84p-3); + result = fma(result, input_sqr, -0x1.018ae6d82506cp-5); + result = fma(result, input_sqr, 0x1.050a65cdec399p-6); + result = fma(result, input_sqr, 0x1.62b81445afbfdp-7); + result = fma(result, input_sqr, 0x1.cbb180b74d85dp-7); + result = fma(result, input_sqr, 0x1.1c3da3ac97e63p-6); + result = fma(result, input_sqr, 0x1.6e8c66fac48d5p-6); + result = fma(result, input_sqr, 0x1.f1c716a8f3363p-6); + result = fma(result, input_sqr, 0x1.6db6db7adc18bp-5); + result = fma(result, input_sqr, 0x1.3333333323ebcp-4); + result = fma(result, input_sqr, 0x1.55555555555bap-3); + result = fma(result, input_sqr, 0x1p0); + result *= input; + + return fma(result, scalar, offset); } static f64 radians_to_degrees(f64 degrees) diff --git a/src/main.c b/src/main.c index c3816f8..84a1765 100644 --- a/src/main.c +++ b/src/main.c @@ -89,6 +89,67 @@ static void free_point_pairs(struct point_pairs *pairs) free(pairs); } +static f64 sum_harvensines_a(struct point_pairs *pairs) +{ + f64 sum = 0; + for (int i = 0; i < pairs->count; i++) { + struct point_pair *p = &pairs->pairs[i]; + sum += get_harvensine_distance(p->x0, p->y0, p->x1, p->y1, EARTH_RADIUS); + } + return sum; +} + +static f64 sum_harvensines_b(struct point_pairs *pairs) +{ + f64 sum = 0; + for (int i = 0; i < pairs->count; i++) { + struct point_pair *p = &pairs->pairs[i]; + + f64 lat1 = p->y0; + f64 lat2 = p->y1; + f64 lon1 = p->x0; + f64 lon2 = p->x1; + + f64 dLat = radians_to_degrees(lat2 - lat1); + f64 dLon = radians_to_degrees(lon2 - lon1); + lat1 = radians_to_degrees(lat1); + lat2 = radians_to_degrees(lat2); + + f64 a = sqaure_f64(my_sin(dLat/2.0)) + my_cos(lat1)*my_cos(lat2)*sqaure_f64(my_sin(dLon/2)); + + sum += my_asin(my_sqrt(a)); + } + sum *= 2.0*EARTH_RADIUS; + + return sum; +} + +static f64 sum_harvensines_c(struct point_pairs *pairs) +{ + f64 sum = 0; + for (int i = 0; i < pairs->count; i++) { + struct point_pair *p = &pairs->pairs[i]; + + f64 lat1 = p->y0; + f64 lat2 = p->y1; + f64 lon1 = p->x0; + f64 lon2 = p->x1; + + f64 dLat = radians_to_degrees(lat2 - lat1); + f64 dLon = radians_to_degrees(lon2 - lon1); + lat1 = radians_to_degrees(lat1); + lat2 = radians_to_degrees(lat2); + + f64 a = sqaure_f64(my_sin(dLat/2.0)) + my_cos(lat1)*my_cos(lat2)*sqaure_f64(my_sin(dLon/2.0)); + + sum += my_asin(my_sqrt(a)); + } + + sum *= 2.0*EARTH_RADIUS; + + return sum; +} + int main(int argc, char **argv) { rprof_init(); @@ -167,23 +228,11 @@ int main(int argc, char **argv) // Step 3. Calculate harvensine distances RPROF_START_BYTES("Compute harvensines", sizeof(struct point_pair)*pairs->count); - f64 *harvensines = malloc(pairs->count*sizeof(f64)); - for (int i = 0; i < pairs->count; i++) { - struct point_pair *p = &pairs->pairs[i]; - harvensines[i] = get_harvensine_distance(p->x0, p->y0, p->x1, p->y1, EARTH_RADIUS); - } - RPROF_STOP(); - - RPROF_START_BYTES("Sum harvensines", sizeof(f64)*pairs->count); - f64 harvensine_sum = 0; - for (int i = 0; i < pairs->count; i++) { - harvensine_sum += harvensines[i]; - } + f64 harvensine_sum = sum_harvensines_c(pairs); RPROF_STOP(); RPROF_START_BYTES("Free struct memory", sizeof(f64)*pairs->count + sizeof(struct point_pair)*pairs->count); free(reference_harvensines); - free(harvensines); RPROF_STOP(); RPROF_START("Free json memory"); free_json_value(parsed); @@ -206,23 +255,5 @@ int main(int argc, char **argv) rprof_output(rprof_cmp_by_exclusive_duration); - printf("\n------- Counters ------\n"); - - printf("Sin:\n"); - printf("Input: %f - %f\n", sin_counters.min_input, sin_counters.max_input); - printf("Output: %f - %f\n\n", sin_counters.min_output, sin_counters.max_output); - - printf("Cos:\n"); - printf("Input: %f - %f\n", cos_counters.min_input, cos_counters.max_input); - printf("Output: %f - %f\n\n", cos_counters.min_output, cos_counters.max_output); - - printf("Sqrt:\n"); - printf("Input: %f - %f\n", sqrt_counters.min_input, sqrt_counters.max_input); - printf("Output: %f - %f\n\n", sqrt_counters.min_output, sqrt_counters.max_output); - - printf("Asin:\n"); - printf("Input: %f - %f\n", asin_counters.min_input, asin_counters.max_input); - printf("Output: %f - %f\n", asin_counters.min_output, asin_counters.max_output); - return 0; } diff --git a/src/math_func_checker.c b/src/math_func_checker.c index 7bbac04..7a02186 100644 --- a/src/math_func_checker.c +++ b/src/math_func_checker.c @@ -2,7 +2,13 @@ #include #include #include +#include +#include "./harvensine_formula.c" + +typedef uint8_t u8; +typedef uint16_t u16; +typedef uint32_t u32; typedef double f64; typedef float f32; typedef uint64_t u64; @@ -22,12 +28,95 @@ struct math_spec { f64 to; }; +// From: https://github.com/cmuratori/computer_enhance/blob/main/perfaware/part4/listing_0184_sine_coefficients.inl +static f64 sine_optimized_coeffs[][11] = +{ + // NOTE(casey): This minimax coefficient table was donated by Demetri Spanos + + {}, + {}, + {0x1.fc4eac57b4a27p-1, -0x1.2b704cf682899p-3}, + {0x1.fff1d21fa9dedp-1, -0x1.53e2e5c7dd831p-3, 0x1.f2438d36d9dbbp-8}, + {0x1.ffffe07d31fe8p-1, -0x1.554f800fc5ea1p-3, 0x1.105d44e6222ap-7, -0x1.83b9725dff6e8p-13}, + {0x1.ffffffd25a681p-1, -0x1.555547ef5150bp-3, 0x1.110e7b396c557p-7, -0x1.9f6445023f795p-13, 0x1.5d38b56aee7f1p-19}, + {0x1.ffffffffd17d1p-1, -0x1.55555541759fap-3, 0x1.11110b74adb14p-7, -0x1.a017a8fe15033p-13, 0x1.716ba4fe56f6ep-19, -0x1.9a0e192a4e2cbp-26}, + {0x1.ffffffffffdcep-1, -0x1.5555555540b9bp-3, 0x1.111111090f0bcp-7, -0x1.a019fce979937p-13, 0x1.71dce5ace58d2p-19, -0x1.ae00fd733fe8dp-26, 0x1.52ace959bd023p-33}, + {0x1.fffffffffffffp-1, -0x1.5555555555469p-3, 0x1.111111110941dp-7, -0x1.a01a0199e0eb3p-13, 0x1.71de37e62aacap-19, -0x1.ae634d22bb47cp-26, 0x1.60e59ae00e00cp-33, -0x1.9ef5d594b342p-41}, + {0x1p0, -0x1.5555555555555p-3, 0x1.11111111110c9p-7, -0x1.a01a01a014eb6p-13, 0x1.71de3a52aab96p-19, -0x1.ae6454d960ac4p-26, 0x1.6123ce513b09fp-33, -0x1.ae43dc9bf8ba7p-41, 0x1.883c1c5deffbep-49}, + {0x1p0, -0x1.5555555555555p-3, 0x1.11111111110dcp-7, -0x1.a01a01a016ef6p-13, 0x1.71de3a53fa85cp-19, -0x1.ae6455b871494p-26, 0x1.612421756f93fp-33, -0x1.ae671378c3d43p-41, 0x1.90277dafc8ab9p-49, -0x1.78262e1f2709cp-58}, + {0x1p0, -0x1.5555555555555p-3, 0x1.11111111110dp-7, -0x1.a01a01a01559ap-13, 0x1.71de3a52ad36dp-19, -0x1.ae64549aa7ca9p-26, 0x1.612392f66fdcdp-33, -0x1.ae11556cad6c4p-41, 0x1.71744c339ad03p-49, 0x1.52947c90f8199p-55, -0x1.ff1898c107cfap-59}, +}; + +// From: https://github.com/cmuratori/computer_enhance/blob/main/perfaware/part4/listing_0187_arcsine_coefficients.inl +static f64 arcsine_taylor_coeffs[] = +{ + 1.0, + 0.1666666666666666666666666666666666666666666666666666666666666666666666666666667, + 0.075, + 0.04464285714285714285714285714285714285714285714285714285714285714285714285714286, + 0.03038194444444444444444444444444444444444444444444444444444444444444444444444444, + 0.02237215909090909090909090909090909090909090909090909090909090909090909090909091, + 0.01735276442307692307692307692307692307692307692307692307692307692307692307692308, + 0.01396484375, + 0.01155180089613970588235294117647058823529411764705882352941176470588235294117647, + 0.009761609529194078947368421052631578947368421052631578947368421052631578947368421, + 0.008390335809616815476190476190476190476190476190476190476190476190476190476190476, + 0.007312525873598845108695652173913043478260869565217391304347826086956521739130435, + 0.0064472103118896484375, + 0.005740037670841923466435185185185185185185185185185185185185185185185185185185185, + 0.005153309682319904195851293103448275862068965517241379310344827586206896551724138, + 0.004660143486915096159904233870967741935483870967741935483870967741935483870967742, + 0.004240907093679363077337091619318181818181818181818181818181818181818181818181818, + 0.003880964558837669236319405691964285714285714285714285714285714285714285714285714, + 0.003569205393825934545413867847339527027027027027027027027027027027027027027027027, + 0.003297059503473484745392432579627403846153846153846153846153846153846153846153846, + 0.003057821649258030669354810947325171493902439024390243902439024390243902439024390, + 0.002846178401108942167876764785411746002906976744186046511627906976744186046511628, + 0.002657870638207289933537443478902180989583333333333333333333333333333333333333333, + 0.002489448678246883494640759965206714386635638297872340425531914893617021276595745, + 0.002338091892111975186930883827866340170101243622448979591836734693877551020408163, + 0.002201473973710138205420020419885130489573759191176470588235294117647058823529412, + 0.002077661032518167442778473553019312192808906987028301886792452830188679245283019, + 0.001965033616277283618439303774555975740606134588068181818181818181818181818181818, + 0.001862226406403127489279102104646562222848858749657346491228070175438596491228070, +}; + +// From: https://github.com/cmuratori/computer_enhance/blob/main/perfaware/part4/listing_0187_arcsine_coefficients.inl +static f64 arsine_minmax_coeffs[][22] = +{ + // NOTE(casey): This minimax coefficient table was donated by Demetri Spanos + + {}, + {}, + {0x1.fdfcefbdd3154p-1 , 0x1.c427597754a37p-3}, + {0x1.0019e5b9a7693p0, 0x1.3b5f83d579a47p-3, 0x1.0da162d6fae3dp-3}, + {0x1.fffa004bed736p-1, 0x1.5acaca323d3aep-3, 0x1.a52ade47d967dp-5, 0x1.b0931e5a07f25p-4}, + {0x1.0000609783343p0, 0x1.543eb056cd449p-3, 0x1.52db50c86c17fp-4, 0x1.c36707c70d21cp-8, 0x1.8faf3815344ddp-4}, + {0x1.ffffe6586d628p-1, 0x1.558b2dc0be61cp-3, 0x1.2a2202ec5cb8p-4, 0x1.f96fb970de571p-5, -0x1.ac22c3939a9a9p-6, 0x1.912219085f248p-4}, + {0x1.000001c517503p0, 0x1.554b23dabce0bp-3, 0x1.35948cff7046bp-4, 0x1.391ca703d0d07p-5, 0x1.03149c11e9277p-4, -0x1.e523c15fbf438p-5, 0x1.a906b64a9bdc7p-4}, + {0x1.ffffff7f5bbbcp-1, 0x1.55573c38fd397p-3, 0x1.329cc1329ab48p-4, 0x1.7f40be8459b49p-5, 0x1.ea3ebd68f4abp-7, 0x1.4b1dad0e7b7e5p-4, -0x1.912818d0401bfp-4, 0x1.d3ec796e5ec8bp-4}, + {0x1.00000009557b8p0, 0x1.5554fb74b4bffp-3, 0x1.3356afe11c66p-4, 0x1.685b6636af595p-5, 0x1.2c25059387c7ep-5, -0x1.5b983df09138p-7, 0x1.db45568eb9217p-4, -0x1.2c7c4453a9b3ep-3, 0x1.0903eea6d1357p-3}, + {0x1.fffffffd3e442p-1, 0x1.555565c9beb43p-3, 0x1.332b1eab3d6f2p-4, 0x1.6f3ed264eef28p-5, 0x1.cc5fc8ed87fdbp-6, 0x1.38c39ea555adep-5, -0x1.88322c8ce661fp-5, 0x1.656a2ea43451dp-3, -0x1.aed7e0dfdbd27p-3, 0x1.32de0e0b3820fp-3}, + {0x1.0000000034db9p0, 0x1.5555525723f64p-3, 0x1.3334fd1dd69f5p-4, 0x1.6d4c8c3659p-5, 0x1.fe5b240c320ebp-6, 0x1.0076fe3314273p-6, 0x1.b627b3be92bd4p-5, -0x1.ba657aa72abeep-4, 0x1.103aa8bb00a4ep-2, -0x1.2deb335977b56p-2, 0x1.699a7715830d2p-3}, + {0x1.ffffffffeff9dp-1, 0x1.555555dff5e06p-3, 0x1.3332d0221f548p-4, 0x1.6dd27e8c33d52p-5, 0x1.edd05e3dff008p-6, 0x1.992d0b8b03f01p-6, -0x1.ac779f1be0507p-13, 0x1.73bb5b359003ap-4, -0x1.a8326f2354f8ap-3, 0x1.9e1b8885f9661p-2, -0x1.a1b0aa236a282p-2, 0x1.b038b25a40e08p-3}, + {0x1.00000000013ap0, 0x1.5555553c5c8a7p-3, 0x1.33334839e1acap-4, 0x1.6dafeb7453ee6p-5, 0x1.f2f65baf85a8cp-6, 0x1.5f396c79d5687p-6, 0x1.9a8031b47fd85p-6, -0x1.cbd84d319158p-6, 0x1.53df7e2c17602p-3, -0x1.7a954b7cb46e6p-2, 0x1.38e97b1392a69p-1, -0x1.1eabdc3fe561ap-1, 0x1.056424720e768p-2}, + {0x1.ffffffffff9fp-1, 0x1.55555559d0d4p-3, 0x1.33332ecf01c13p-4, 0x1.6db88c4cfe8eap-5, 0x1.f17068ec7ac68p-6, 0x1.73b9408ccb9b1p-6, 0x1.d2a82629eb78ep-7, 0x1.1b4dda11bb1d2p-5, -0x1.5210c527bd7ep-4, 0x1.3a638b5965e45p-2, -0x1.4434b98838c1dp-1, 0x1.d52ccc09ba2cdp-1, -0x1.8792b45ef365ep-1, 0x1.3f5545e9e11eap-2}, + {0x1.0000000000079p0, 0x1.5555555487dd3p-3, 0x1.3333341adb0b8p-4, 0x1.6db67483a8f77p-5, 0x1.f1defdcf41a11p-6, 0x1.6ce213041c326p-6, 0x1.2f8bd23b33763p-6, 0x1.34a6d9f27428dp-8, 0x1.007f36ef69d66p-4, -0x1.850e0d65729e1p-3, 0x1.1f42350f23ccep-1, -0x1.0e0b5512f8d35p0, 0x1.5d065bf34c03ep0, -0x1.0a98c5604a5c6p0, 0x1.8978c6502660ap-2}, + {0x1.fffffffffffdap-1, 0x1.555555557a085p-3, 0x1.33333304070d3p-4, 0x1.6db6f35f4ac13p-5, 0x1.f1c0bdf8248f6p-6, 0x1.6f0e61397193p-6, 0x1.15740f26a5e24p-6, 0x1.24069344266aap-6, -0x1.c02ef74c5e655p-7, 0x1.07833aeac1562p-3, -0x1.97487ee8ceb5p-2, 0x1.0178f7f5c01bdp0, -0x1.b8b2ea879a2a5p0, 0x1.01ccfbe6e1f6ap1, -0x1.6a46d11c16386p0, 0x1.e86a774524862p-2}, + {0x1.0000000000003p0, 0x1.555555554ecb4p-3, 0x1.3333333cb4a27p-4, 0x1.6db6d5f669d29p-5, 0x1.f1c8c3485860bp-6, 0x1.6e64f7828f426p-6, 0x1.1ea1dc340da9p-6, 0x1.98123c756ff58p-7, 0x1.7a0c83f514b22p-6, -0x1.bd6eb7cdaf8e4p-5, 0x1.162743c14bf13p-2, -0x1.944c737b04ef5p-1, 0x1.c47bd23ee68a2p0, -0x1.61d1590acbbfp1, 0x1.7a6e0b194804dp1, -0x1.eba481b8f24dfp0, 0x1.311805d4c6d33p-1}, + {0x1.fffffffffffffp-1, 0x1.555555555683fp-3, 0x1.3333333148aa7p-4, 0x1.6db6dca9f82d4p-5, 0x1.f1c6b0ea300d7p-6, 0x1.6e96be6dbe49ep-6, 0x1.1b8cc838ee86ep-6, 0x1.dc086c5d99cdcp-7, 0x1.b1b8d27cd7e72p-8, 0x1.5565a3d3908b9p-5, -0x1.2ab04ba9012e3p-3, 0x1.224c4dbe13cbdp-1, -0x1.83633c76e4551p0, 0x1.86bbff2a6c7b6p1, -0x1.188f223fe5f34p2, 0x1.14672d35db97ep2, -0x1.4d84801ff1aa1p1, 0x1.7f820d52c2775p-1}, + {0x1p0, 0x1.555555555531ep-3, 0x1.3333333380df2p-4, 0x1.6db6db3184756p-5, 0x1.f1c73443a02f5p-6, 0x1.6e88ce94d1149p-6, 0x1.1c875d6c5323dp-6, 0x1.c37061f4e5f55p-7, 0x1.b8a33b8e380efp-7, -0x1.21438ccc95d62p-8, 0x1.69b370aad086ep-4, -0x1.57380bcd2890ep-2, 0x1.1fb54da575b22p0, -0x1.6067d334b4792p1, 0x1.4537ddde2d76dp2, -0x1.b06f523e74f33p2, 0x1.8bf4dadaf548cp2, -0x1.bec6daf74ed61p1, 0x1.dfc53682725cap-1}, + {0x1p0, 0x1.55555555555bap-3, 0x1.3333333323ebcp-4, 0x1.6db6db7adc18bp-5, 0x1.f1c716a8f3363p-6, 0x1.6e8c66fac48d5p-6, 0x1.1c3da3ac97e63p-6, 0x1.cbb180b74d85dp-7, 0x1.62b81445afbfdp-7, 0x1.050a65cdec399p-6, -0x1.018ae6d82506cp-5, 0x1.a361973086e84p-3, -0x1.7f8907c1978c3p-1, 0x1.1debe7d3f064p1, -0x1.411c99c675e12p2, 0x1.106a078008a9ap3, -0x1.500975aa37fb8p3, 0x1.1ea75d01d0debp3, -0x1.2ee507d6a1a5fp2, 0x1.3070aa6a5b88ep0}, + {0x1p0, 0x1.5555555555544p-3, 0x1.3333333336209p-4, 0x1.6db6db6aeb726p-5, 0x1.f1c71dcf049c4p-6, 0x1.6e8b6f8df785cp-6, 0x1.1c53c3234c54p-6, 0x1.c8eb3e8133ceap-7, 0x1.8335ee4136147p-7, 0x1.d9a5ff05f747ep-8, 0x1.b949ad43fb2bdp-6, -0x1.9080df821c302p-4, 0x1.e245cd46c886cp-2, -0x1.99434e2a3223ap0, 0x1.147d4d3b7ec76p2, -0x1.1e2a8ce097204p3, 0x1.c17aa6abf54eap3, -0x1.02778b2d86e57p4, 0x1.9ccd7e4c0706bp3, -0x1.9a13424bd53c2p2, 0x1.837ec3890fee1p0}, + {0x1p0, 0x1.5555555555558p-3, 0x1.3333333332aedp-4, 0x1.6db6db6e45234p-5, 0x1.f1c71c24301p-6, 0x1.6e8baf9ddc763p-6, 0x1.1c4d64d353371p-6, 0x1.c9cf1f8de89e6p-7, 0x1.778d723247697p-7, 0x1.5fcac651d07d4p-7, 0x1.799c2f33c0274p-12, 0x1.e288894a8bc33p-5, -0x1.0446ef7fdb149p-2, 0x1.0ba0fa7048fb2p0, -0x1.a273e0e74ee85p1, 0x1.034f776a3db58p3, -0x1.f1adf47b08719p3, 0x1.6c271c319b92ap4, -0x1.886f83ada1ccfp4, 0x1.26c247c3a321bp4, -0x1.146482ddd5f29p3, 0x1.ed3ada8793e41p0}, +}; + static f64 square(f64 num) { return num * num; } -static f64 my_sin(f64 num) +static f64 my_sin_approx(f64 num) { f64 sign = 1; if (num < 0) { @@ -91,7 +180,7 @@ static f64 my_cos_using_sin(f64 num) return my_sin_quarter_fitted(num + M_PI/2); } -static f64 my_cos(f64 num) +static f64 my_cos_approx(f64 num) { if (num < 0) { num *= -1; @@ -119,11 +208,6 @@ static f64 my_sqrt2(f64 num) return result; } -static f64 my_asin(f64 num) -{ - return num; -} - static f64 factorial(size_t n) { f64 result = 1; @@ -144,7 +228,7 @@ static void create_taylor_series_coeffs(f64 *coeffs, size_t coeffs_len) } } -static f64 sin_using_taylor(f64 input, f64 *coeffs, size_t coeffs_len) +static f64 compute_taylor_series(f64 input, f64 *coeffs, size_t coeffs_len) { f64 result = 0; for (size_t i = 0; i < coeffs_len; i++) { @@ -154,7 +238,7 @@ static f64 sin_using_taylor(f64 input, f64 *coeffs, size_t coeffs_len) return result; } -static f64 sin_using_taylor_horner(f64 input, f64 *coeffs, size_t coeffs_len) +static f64 compute_taylor_series_horner(f64 input, f64 *coeffs, size_t coeffs_len) { f64 input_sqr = input * input; @@ -171,6 +255,88 @@ static f64 sin_using_taylor_horner(f64 input, f64 *coeffs, size_t coeffs_len) return result; } +static f64 compute_taylor_series_horner_fma(f64 input, f64 *coeffs, size_t coeffs_len) +{ + f64 input_sqr = input * input; + + f64 result = coeffs[coeffs_len - 1]; + for (size_t j = 1; j < coeffs_len; j++) { + size_t i = coeffs_len - j - 1; + + result = fma(result, input_sqr, coeffs[i]); + } + + result *= input; + + return result; +} + +static f64 sin_best(f64 input) +{ + f64 sign = 1; + if (input < 0) { + sign *= -1; + input = -input; + } + + if (input > M_PI / 2) { + input = M_PI - input; + } + + f64 input_sqr = input * input; + + f64 result = 0x1.883c1c5deffbep-49; + result = fma(result, input_sqr, -0x1.ae43dc9bf8ba7p-41); + result = fma(result, input_sqr, 0x1.6123ce513b09fp-33); + result = fma(result, input_sqr, -0x1.ae6454d960ac4p-26); + result = fma(result, input_sqr, 0x1.71de3a52aab96p-19); + result = fma(result, input_sqr, -0x1.a01a01a014eb6p-13); + result = fma(result, input_sqr, 0x1.11111111110c9p-7); + result = fma(result, input_sqr, -0x1.5555555555555p-3); + result = fma(result, input_sqr, 0x1p0); + result *= input; + + return result * sign; +} + +static f64 asin_best(f64 input) +{ + f64 scalar = 1; + f64 offset = 0; + f64 one_over_sqrt_2 = 1/sqrt(2); + if (input > one_over_sqrt_2) { + input = my_sqrt1(1 - input * input); + scalar = -1; + offset = M_PI/2; + } + + f64 input_sqr = input * input; + + f64 result = 0x1.3070aa6a5b88ep0; + result = fma(result, input_sqr, -0x1.2ee507d6a1a5fp2); + result = fma(result, input_sqr, 0x1.1ea75d01d0debp3); + result = fma(result, input_sqr, -0x1.500975aa37fb8p3); + result = fma(result, input_sqr, 0x1.106a078008a9ap3); + result = fma(result, input_sqr, -0x1.411c99c675e12p2); + result = fma(result, input_sqr, 0x1.1debe7d3f064p1); + result = fma(result, input_sqr, -0x1.7f8907c1978c3p-1); + result = fma(result, input_sqr, 0x1.a361973086e84p-3); + result = fma(result, input_sqr, -0x1.018ae6d82506cp-5); + result = fma(result, input_sqr, 0x1.050a65cdec399p-6); + result = fma(result, input_sqr, 0x1.62b81445afbfdp-7); + result = fma(result, input_sqr, 0x1.cbb180b74d85dp-7); + result = fma(result, input_sqr, 0x1.1c3da3ac97e63p-6); + result = fma(result, input_sqr, 0x1.6e8c66fac48d5p-6); + result = fma(result, input_sqr, 0x1.f1c716a8f3363p-6); + result = fma(result, input_sqr, 0x1.6db6db7adc18bp-5); + result = fma(result, input_sqr, 0x1.3333333323ebcp-4); + result = fma(result, input_sqr, 0x1.55555555555bap-3); + result = fma(result, input_sqr, 0x1p0); + result *= input; + + return fma(result, scalar, offset); +} + static void tester_record_result(struct tester *tester, f64 input, f64 my_output, f64 reference_output) { f64 error = reference_output - my_output; @@ -186,7 +352,7 @@ static void tester_record_result(struct tester *tester, f64 input, f64 my_output static void tester_show_result(struct tester *tester, const char *name) { - printf("%20s max error %.16f at %f\n", name, tester->max_error, tester->max_error_at); + printf("%20s max error %.24f at %f\n", name, tester->max_error, tester->max_error_at); } static f64 lerp(f64 x, f64 from, f64 to) @@ -200,11 +366,13 @@ static void test_taylor_series(u64 samples, f64 *coeffs, size_t coeffs_len) struct tester tester = { }; struct tester tester_horner = { }; + struct tester tester_horner_fma = { }; for (int j = 0; j < samples; j++) { f64 input = lerp((f64)j / samples, 0, M_PI/2); - tester_record_result(&tester, input, sin_using_taylor(input, coeffs, coeffs_len), sin(input)); - tester_record_result(&tester_horner, input, sin_using_taylor_horner(input, coeffs, coeffs_len), sin(input)); + tester_record_result(&tester, input, compute_taylor_series(input, coeffs, coeffs_len), sin(input)); + tester_record_result(&tester_horner, input, compute_taylor_series_horner(input, coeffs, coeffs_len), sin(input)); + tester_record_result(&tester_horner_fma, input, compute_taylor_series_horner_fma(input, coeffs, coeffs_len), sin(input)); } { @@ -219,16 +387,61 @@ static void test_taylor_series(u64 samples, f64 *coeffs, size_t coeffs_len) tester_show_result(&tester_horner, name); } - printf(" diff %.16f\n", tester.max_error - tester_horner.max_error); + { + char name[64] = { 0 }; + snprintf(name, sizeof(name), "taylor f %zu", coeffs_len); + tester_show_result(&tester_horner, name); + } + + printf(" diff normal - horner %.24f\n", tester.max_error - tester_horner.max_error); + printf(" diff horner - fused %.24f\n", tester_horner.max_error - tester_horner_fma.max_error); } -int main() +static u8 rand_u8() { + return rand() & 0xFF; +} +static u16 rand_u16() { + return rand_u8() | (rand_u8() << 8); +} +static u32 rand_u32() { + return rand_u16() | (rand_u16() << 16); +} + +// https://stackoverflow.com/a/55766267 +static f64 rand_f64() { + u64 r53 = ((uint64_t)(rand_u32()) << 21) ^ (rand_u8() >> 2); + return (double)r53 / 9007199254740991.0; // 2^53 - 1 +} + +static f64 rand_f64_range(f64 from, f64 to) { + return rand_f64() * (to - from) + from; +} + +static f64 get_harvensine_distance_reference(f64 X0, f64 Y0, f64 X1, f64 Y1, f64 earth_radius) +{ + f64 lat1 = Y0; + f64 lat2 = Y1; + f64 lon1 = X0; + f64 lon2 = X1; + + f64 dLat = radians_to_degrees(lat2 - lat1); + f64 dLon = radians_to_degrees(lon2 - lon1); + lat1 = radians_to_degrees(lat1); + lat2 = radians_to_degrees(lat2); + + f64 a = sqaure_f64(sin(dLat/2.0)) + cos(lat1)*cos(lat2)*sqaure_f64(sin(dLon/2)); + f64 c = 2.0*asin(sqrt(a)); + + return earth_radius * c; +} + +static void sin_tests() { uint64_t samples = 100000; struct math_spec specs[] = { { .name = "sin", - .mine = my_sin, + .mine = my_sin_approx, .reference = sin, .from = -M_PI, .to = M_PI @@ -254,9 +467,16 @@ int main() .from = -M_PI, .to = M_PI }, + { + .name = "sin_best", + .mine = sin_best, + .reference = sin, + .from = -M_PI, + .to = M_PI + }, { .name = "cos", - .mine = my_cos, + .mine = my_cos_approx, .reference = cos, .from = -M_PI/2, .to = M_PI/2 @@ -274,13 +494,6 @@ int main() .reference = sqrt, .from = 0, .to = 1, - }, - { - .name = "asin", - .mine = my_asin, - .reference = asin, - .from = 0, - .to = 1, } }; @@ -297,10 +510,113 @@ int main() tester_show_result(&tester, spec.name); } - f64 taylor_coeffs[10]; + for (int i = 2; i < ARRAY_LEN(sine_optimized_coeffs); i++) { + struct tester tester = { }; + + f64 *coeffs = sine_optimized_coeffs[i]; + size_t coeffs_len = i; + + for (int j = 0; j < samples; j++) { + f64 input = lerp((f64)j / samples, 0, M_PI/2); + tester_record_result(&tester, input, compute_taylor_series_horner_fma(input, coeffs, coeffs_len), sin(input)); + } + + char name[64] = { 0 }; + snprintf(name, sizeof(name), "optimized taylor %zu", coeffs_len); + tester_show_result(&tester, name); + } + + f64 taylor_coeffs[13]; for (int i = 1; i < ARRAY_LEN(taylor_coeffs); i++) { test_taylor_series(samples, taylor_coeffs, i); } +} + +static void arcsin_tests() +{ + uint64_t samples = 100000; + + f64 from = 0; + f64 to = 1/sqrt(2); + + for (int i = 0; i < ARRAY_LEN(arcsine_taylor_coeffs) - 1; i++) { + f64 *coeffs = arcsine_taylor_coeffs; + size_t coeffs_len = i + 1; + + struct tester tester = { }; + + for (int j = 0; j < samples; j++) { + f64 input = lerp((f64)j / samples, from, to); + tester_record_result(&tester, input, compute_taylor_series_horner_fma(input, coeffs, coeffs_len), asin(input)); + } + + char name[64] = { 0 }; + snprintf(name, sizeof(name), "taylor %zu", coeffs_len); + tester_show_result(&tester, name); + } + + for (int i = 2; i < ARRAY_LEN(arsine_minmax_coeffs) - 1; i++) { + f64 *coeffs = arsine_minmax_coeffs[i]; + size_t coeffs_len = i + 1; + + struct tester tester = { }; + + for (int j = 0; j < samples; j++) { + f64 input = lerp((f64)j / samples, from, to); + tester_record_result(&tester, input, compute_taylor_series_horner_fma(input, coeffs, coeffs_len), asin(input)); + } + + char name[64] = { 0 }; + snprintf(name, sizeof(name), "minmax %zu", coeffs_len); + tester_show_result(&tester, name); + } + + { + struct tester tester = { }; + + for (int j = 0; j < samples; j++) { + f64 input = lerp((f64)j / samples, 0, 1); + tester_record_result(&tester, input, asin_best(input), asin(input)); + } + + tester_show_result(&tester, "best"); + } +} + +static void harvensine_tests() +{ + uint64_t samples = 100000; + + f64 max_difference = 0; + + u32 seed = time(NULL); + srand(seed); + + for (int i = 0; i < samples; i++) { + f64 x0 = rand_f64_range(-180, 180); + f64 y0 = rand_f64_range(-90, 90); + f64 x1 = rand_f64_range(-180, 180); + f64 y1 = rand_f64_range(-90, 90); + + f64 expected = get_harvensine_distance_reference(x0, y0, x1, y1, EARTH_RADIUS); + f64 actual = get_harvensine_distance(x0, y0, x1, y1, EARTH_RADIUS); + + max_difference = MAX(max_difference, expected - actual); + } + + printf("reference vs actual, max error: %.17f\n", max_difference); +} + +int main() +{ + // printf("---------- sin() -------------\n"); + // sin_tests(); + // printf("--------- asin() -------------\n"); + // arcsin_tests(); + // printf("%.24f\n", 1.0 / sqrt(2)); + + printf("--------- haversine() -------------\n"); + harvensine_tests(); return 0; }