// Micro-benchmark comparing nine ways of evaluating one polynomial with
// `datasize` coefficients at many points x:
//
//   * "Brute"      - sum of data[i] * x^i using a running power of x;
//   * "Horner"     - the classic Horner recurrence, highest coefficient first;
//   * "Hocevar[K]" - the coefficients are split into K interleaved
//                    subsequences, each one is evaluated with a Horner
//                    recurrence in x^K (K independent dependency chains),
//                    and the K partial results are combined at the end.
//
// For every scheme the program prints the mean time per evaluation in
// nanoseconds and, for all but the first, the time relative to "Brute".
// The sums of the computed values are compared afterwards; if any scheme
// disagrees with "Brute" (which floating-point rounding makes likely), all
// nine sums are printed with 20 significant digits.
#include <chrono>
#include <cmath>
#include <cstddef>
#include <iomanip>
#include <iostream>
#include <vector>

int main()
{
    using bench_clock = std::chrono::high_resolution_clock;
    bench_clock::time_point t0, t1;

    std::size_t const datasize = 840;
    std::size_t const iterations = 1000;

    // Every "Hocevar" loop below walks the coefficients from the top in
    // steps of K (K = 2..8) and stops when the index reaches exactly zero.
    // That only works when K divides datasize; otherwise the unsigned index
    // wraps around and the loop reads out of bounds. 840 = lcm(1, ..., 8).
    static_assert(datasize % 840 == 0,
                  "datasize must be divisible by every stride from 2 to 8");

    double stats[9] = { 0.0 };  // accumulated nanoseconds per scheme
    double trash[9] = { 0.0 };  // accumulated results; keeps the work observable

    // The vector owns the coefficients (the original released a new[]
    // allocation with scalar delete, which is undefined behaviour). The
    // timed loops still index a plain pointer, exactly as before.
    std::vector<double> storage(datasize);
    double *data = storage.data();

    // Deterministic pseudo-random coefficients in [-10, 10): the fractional
    // part of 17 * sin(77.4567 * i), recentred around zero and scaled by 20.
    for (std::size_t i = 0; i < datasize; i++)
    {
        double k = std::sin(77.4567 * i);
        data[i] = 20.0 * (k * 17.0 - std::floor(k * 17.0) - 0.5);
    }

    for (std::size_t run = 0; run < iterations; run++)
    {
        // Evaluation point; sweeps [0, 1) over the course of the benchmark.
        double x = 1.0 * run / iterations;
        double y0, y1, y2, y3, y4, y5, y6, y7;
        double xn, x2, x3, x4, x5, x6, x7, x8;

        // --- Brute: running power of x -----------------------------------
        t0 = bench_clock::now();
        y0 = 0.0;
        xn = 1.0;
        for (std::size_t i = 0; i < datasize; i++)
        {
            y0 += xn * data[i];
            xn *= x;
        }
        t1 = bench_clock::now();
        stats[0] += std::chrono::nanoseconds(t1 - t0).count();
        trash[0] += y0;

        // --- Horner: one dependency chain --------------------------------
        t0 = bench_clock::now();
        y0 = 0.0;
        for (std::size_t i = datasize; i--; )
            y0 = y0 * x + data[i];
        t1 = bench_clock::now();
        stats[1] += std::chrono::nanoseconds(t1 - t0).count();
        trash[1] += y0;

        // --- Hocevar: two interleaved chains in x^2 ----------------------
        t0 = bench_clock::now();
        y0 = y1 = 0.0;
        x2 = x * x;
        for (std::size_t i = datasize; i; i -= 2)
        {
            y0 = y0 * x2 + data[i - 1];
            y1 = y1 * x2 + data[i - 2];
        }
        y0 = y0 * x + y1;
        t1 = bench_clock::now();
        stats[2] += std::chrono::nanoseconds(t1 - t0).count();
        trash[2] += y0;

        // --- Hocevar3: three interleaved chains in x^3 -------------------
        t0 = bench_clock::now();
        y0 = y1 = y2 = 0.0;
        x2 = x * x;
        x3 = x2 * x;
        for (std::size_t i = datasize; i; i -= 3)
        {
            y0 = y0 * x3 + data[i - 1];
            y1 = y1 * x3 + data[i - 2];
            y2 = y2 * x3 + data[i - 3];
        }
        y0 = y0 * x2 + y1 * x + y2;
        t1 = bench_clock::now();
        stats[3] += std::chrono::nanoseconds(t1 - t0).count();
        trash[3] += y0;

        // --- Hocevar4: four interleaved chains in x^4 --------------------
        t0 = bench_clock::now();
        y0 = y1 = y2 = y3 = 0.0;
        x2 = x * x;
        x3 = x2 * x;
        x4 = x2 * x2;
        for (std::size_t i = datasize; i; i -= 4)
        {
            y0 = y0 * x4 + data[i - 1];
            y1 = y1 * x4 + data[i - 2];
            y2 = y2 * x4 + data[i - 3];
            y3 = y3 * x4 + data[i - 4];
        }
        y0 = y0 * x3 + y1 * x2 + y2 * x + y3;
        t1 = bench_clock::now();
        stats[4] += std::chrono::nanoseconds(t1 - t0).count();
        trash[4] += y0;

        // --- Hocevar5: five interleaved chains in x^5 --------------------
        t0 = bench_clock::now();
        y0 = y1 = y2 = y3 = y4 = 0.0;
        x2 = x * x;
        x3 = x2 * x;
        x4 = x2 * x2;
        x5 = x2 * x3;
        for (std::size_t i = datasize; i; i -= 5)
        {
            y0 = y0 * x5 + data[i - 1];
            y1 = y1 * x5 + data[i - 2];
            y2 = y2 * x5 + data[i - 3];
            y3 = y3 * x5 + data[i - 4];
            y4 = y4 * x5 + data[i - 5];
        }
        y0 = y0 * x4 + y1 * x3 + y2 * x2 + y3 * x + y4;
        t1 = bench_clock::now();
        stats[5] += std::chrono::nanoseconds(t1 - t0).count();
        trash[5] += y0;

        // --- Hocevar6: six interleaved chains in x^6 ---------------------
        t0 = bench_clock::now();
        y0 = y1 = y2 = y3 = y4 = y5 = 0.0;
        x2 = x * x;
        x3 = x2 * x;
        x4 = x2 * x2;
        x5 = x2 * x3;
        x6 = x3 * x3;
        for (std::size_t i = datasize; i; i -= 6)
        {
            y0 = y0 * x6 + data[i - 1];
            y1 = y1 * x6 + data[i - 2];
            y2 = y2 * x6 + data[i - 3];
            y3 = y3 * x6 + data[i - 4];
            y4 = y4 * x6 + data[i - 5];
            y5 = y5 * x6 + data[i - 6];
        }
        y0 = y0 * x5 + y1 * x4 + y2 * x3 + y3 * x2 + y4 * x + y5;
        t1 = bench_clock::now();
        stats[6] += std::chrono::nanoseconds(t1 - t0).count();
        trash[6] += y0;

        // --- Hocevar7: seven interleaved chains in x^7 -------------------
        t0 = bench_clock::now();
        y0 = y1 = y2 = y3 = y4 = y5 = y6 = 0.0;
        x2 = x * x;
        x3 = x2 * x;
        x4 = x2 * x2;
        x5 = x2 * x3;
        x6 = x3 * x3;
        x7 = x4 * x3;
        for (std::size_t i = datasize; i; i -= 7)
        {
            y0 = y0 * x7 + data[i - 1];
            y1 = y1 * x7 + data[i - 2];
            y2 = y2 * x7 + data[i - 3];
            y3 = y3 * x7 + data[i - 4];
            y4 = y4 * x7 + data[i - 5];
            y5 = y5 * x7 + data[i - 6];
            y6 = y6 * x7 + data[i - 7];
        }
        y0 = y0 * x6 + y1 * x5 + y2 * x4 + y3 * x3 + y4 * x2 + y5 * x + y6;
        t1 = bench_clock::now();
        stats[7] += std::chrono::nanoseconds(t1 - t0).count();
        trash[7] += y0;

        // --- Hocevar8: eight interleaved chains in x^8 -------------------
        t0 = bench_clock::now();
        y0 = y1 = y2 = y3 = y4 = y5 = y6 = y7 = 0.0;
        x2 = x * x;
        x3 = x2 * x;
        x4 = x2 * x2;
        x5 = x2 * x3;
        x6 = x3 * x3;
        x7 = x4 * x3;
        x8 = x4 * x4;
        for (std::size_t i = datasize; i; i -= 8)
        {
            y0 = y0 * x8 + data[i - 1];
            y1 = y1 * x8 + data[i - 2];
            y2 = y2 * x8 + data[i - 3];
            y3 = y3 * x8 + data[i - 4];
            y4 = y4 * x8 + data[i - 5];
            y5 = y5 * x8 + data[i - 6];
            y6 = y6 * x8 + data[i - 7];
            y7 = y7 * x8 + data[i - 8];
        }
        y0 = y0 * x7 + y1 * x6 + y2 * x5 + y3 * x4 + y4 * x3 + y5 * x2 + y6 * x + y7;
        t1 = bench_clock::now();
        stats[8] += std::chrono::nanoseconds(t1 - t0).count();
        trash[8] += y0;
    }

    // Report: mean nanoseconds per evaluation, plus the percentage of the
    // "Brute" time for every other scheme.
    static char const *const labels[9] = {
        "Brute: ", "Horner: ", "Hocevar: ", "Hocevar3: ", "Hocevar4: ",
        "Hocevar5: ", "Hocevar6: ", "Hocevar7: ", "Hocevar8: ",
    };

    std::cout << labels[0] << stats[0] * (1.0 / iterations) << " ns" << '\n';
    for (std::size_t k = 1; k < 9; k++)
    {
        std::cout << labels[k] << stats[k] * (1.0 / iterations) << " ns ("
                  << 100.0 * stats[k] / stats[0] << "%)" << '\n';
    }

    // Exact comparison is intentional: the dump is a diagnostic showing how
    // far the schemes drift apart through rounding, not a pass/fail check.
    bool mismatch = false;
    for (std::size_t k = 1; k < 9; k++)
    {
        if (trash[0] != trash[k])
            mismatch = true;
    }

    if (mismatch)
    {
        std::cout << std::setprecision(20);
        for (std::size_t k = 0; k < 9; k++)
            std::cout << trash[k] << '\n';
    }

    return 0;
}