mirror of
https://github.com/libretro/RetroArch.git
synced 2024-12-01 04:00:32 +00:00
169 lines
8.9 KiB
Plaintext
169 lines
8.9 KiB
Plaintext
|
|
||
|
Library: MATH-NEON
|
||
|
By: Lachlan Tychsen-Smith
|
||
|
Licence: MIT (expat)
|
||
|
=======================================================================================
|
||
|
This project implements the cmath functions and some optimised matrix functions
|
||
|
with the aim of increasing the floating point performance of ARM Cortex A-8
|
||
|
based platforms. As well as implementing the functions in ARM NEON assembly,
|
||
|
they sacrifice error checking and some accuracy to achieve better performance.
|
||
|
|
||
|
Function Errors:
|
||
|
=======================================================================================
|
||
|
The measurement and characterisation of the inaccuracies present within these
|
||
|
functions is really a field within itself. For the benchmark I provide the
|
||
|
maximum absolute, maximum relative and root mean squared error compared to the
|
||
|
cmath implementations over the specified range. However these values can be
|
||
|
misleading, especially for functions which quickly go to infinity. So it's always a
|
||
|
good idea to test it within your actual program. In general, this library will not
|
||
|
be as accurate as cmath, however for many functions it is close enough to be
|
||
|
negligible.
|
||
|
|
||
|
Notes:
|
||
|
=======================================================================================
|
||
|
- The *_c functions are c implementations of the *_neon code.
|
||
|
- Like cmath, the errors present in the functions are very dependent on the
|
||
|
range which you're operating in. So you should test them first.
|
||
|
- Look in the "math_neon.h" file for descriptions of the functions. In some
|
||
|
function files there are also notes on the specific implementation.
|
||
|
- The *_neon functions make certain assumptions about the location of arguments
|
||
|
that are incompatible with inlining.
|
||
|
|
||
|
Contact:
|
||
|
=======================================================================================
|
||
|
Name: Lachlan Tychsen-Smith
|
||
|
Email: lachlan.ts@gmail.com
|
||
|
|
||
|
PSVITA performance test results:
|
||
|
|
||
|
RUNFAST: Disabled
|
||
|
------------------------------------------------------------------------------------------------------
|
||
|
MATRIX FUNCTION TESTS
|
||
|
------------------------------------------------------------------------------------------------------
|
||
|
matmul2_c =
|
||
|
|-14.56, 5.96|
|
||
|
|-15.35, 10.50|
|
||
|
matmul2_neon =
|
||
|
|-14.56, 5.96|
|
||
|
|-15.35, 10.50|
|
||
|
matmul2: c=174924 neon=64490 rate=2.71
|
||
|
matvec2_c = |-14.56, -15.35|
|
||
|
matvec2_neon = |-14.56, -15.35|
|
||
|
matvec2: c=88957 neon=58337 rate=1.52
|
||
|
matmul3_c =
|
||
|
|-21.39, -4.68, -1.74|
|
||
|
|-8.66, -8.97, 1.83|
|
||
|
|15.88, 0.30, -2.23|
|
||
|
matmul3_neon =
|
||
|
|-21.39, -4.68, -1.74|
|
||
|
|-8.66, -8.97, 1.83|
|
||
|
|15.88, 0.30, -2.23|
|
||
|
matmul3: c=552486 neon=297268 rate=1.86
|
||
|
matvec3_c = |-21.39, -8.66, 15.88|
|
||
|
matvec3_neon = |-21.39, -8.66, 15.88|
|
||
|
matvec3: c=184104 neon=128780 rate=1.43
|
||
|
matmul4_c =
|
||
|
|-13.65, -1.80, -12.92, 6.56|
|
||
|
|-10.21, 9.47, 2.73, 14.79|
|
||
|
|0.97, 11.69, -0.64, -12.87|
|
||
|
|20.06, 6.77, 35.61, -0.02|
|
||
|
matmul4_neon =
|
||
|
|-13.65, -1.80, -12.92, 6.56|
|
||
|
|-10.21, 9.47, 2.73, 14.79|
|
||
|
|0.97, 11.69, -0.64, -12.87|
|
||
|
|20.06, 6.77, 35.61, -0.02|
|
||
|
matmul4: c=1315568 neon=254227 rate=5.17
|
||
|
matvec4_c = |-13.65, -10.21, 0.97, 20.058556|
|
||
|
matvec4_neon = |-13.65, -10.21, 0.97, 20.058556|
|
||
|
matvec4: c=331712 neon=147196 rate=2.25
|
||
|
|
||
|
dot2_c = -10.903330
|
||
|
dot2_neon = -10.903330
|
||
|
dot2: c=230295 neon=168799 rate=1.36
|
||
|
normalize2_c = [-0.74, 0.67]
|
||
|
normalize2_neon = [-0.74, 0.67]
|
||
|
normalize2: c=950716 neon=965780 rate=0.98
|
||
|
|
||
|
dot3_c = -4.226746
|
||
|
dot3_neon = -4.226746
|
||
|
dot3: c=306957 neon=337316 rate=0.91
|
||
|
normalize3_c = [-0.69, 0.62, -0.38]
|
||
|
normalize3_neon = [-0.69, 0.62, -0.38]
|
||
|
normalize3: c=1180950 neon=1134557 rate=1.04
|
||
|
cross3_c = [-9.67, -19.39, -14.24]
|
||
|
cross3_neon = [-9.67, -19.39, -14.24]
|
||
|
cross3: c=659558 neon=766896 rate=0.86
|
||
|
|
||
|
dot4_c = 2.782796
|
||
|
dot4_neon = 2.782796
|
||
|
dot4: c=414233 neon=276068 rate=1.50
|
||
|
normalize4_c = [-0.59, 0.53, -0.32, -0.52]
|
||
|
normalize4_neon = [-0.59, 0.53, -0.32, -0.52]
|
||
|
normalize4: c=1364294 neon=1103327 rate=1.24
|
||
|
|
||
|
------------------------------------------------------------------------------------------------------
|
||
|
CMATH FUNCTION TESTS
|
||
|
------------------------------------------------------------------------------------------------------
|
||
|
Function Range Number ABS Max Error REL Max Error RMS Error Time Rate
|
||
|
------------------------------------------------------------------------------------------------------
|
||
|
sinf [-3.14, 3.14] 500000 0.00e+00 0.00e+00% 0.00e+00 1394459996 x1.00
|
||
|
sinf_c [-3.14, 3.14] 500000 7.75e-07 1.00e+02% 4.09e-07 1395128226 x1.00
|
||
|
sinf_neon [-3.14, 3.14] 500000 8.34e-07 1.00e+02% 4.09e-07 1395853554 x1.00
|
||
|
cosf [-3.14, 3.14] 500000 0.00e+00 0.00e+00% 0.00e+00 1396644271 x1.00
|
||
|
cosf_c [-3.14, 3.14] 500000 7.75e-07 6.74e-01% 4.15e-07 1397360321 x1.00
|
||
|
cosf_neon [-3.14, 3.14] 500000 8.34e-07 6.74e-01% 4.16e-07 1398126872 x1.00
|
||
|
tanf [-0.79, 0.79] 500000 0.00e+00 0.00e+00% 0.00e+00 1398889596 x1.00
|
||
|
tanf_c [-0.79, 0.79] 500000 2.98e-06 7.94e-04% 1.31e-06 1399704712 x1.00
|
||
|
tanf_neon [-0.79, 0.79] 500000 1.91e-06 3.62e-04% 6.66e-07 1400612899 x1.00
|
||
|
asinf [-1.00, 1.00] 500000 0.00e+00 0.00e+00% 0.00e+00 1401838993 x1.00
|
||
|
asinf_c [-1.00, 1.00] 500000 5.54e-05 1.06e-02% nan 1402745512 x1.00
|
||
|
asinf_neon [-1.00, 1.00] 500000 4.66e-05 8.90e-03% nan 1403967661 x1.00
|
||
|
acosf [-1.00, 1.00] 500000 0.00e+00 0.00e+00% 0.00e+00 1405317842 x1.00
|
||
|
acosf_c [-1.00, 1.00] 500000 5.56e-05 6.46e-03% nan 1406294753 x1.00
|
||
|
acosf_neon [-1.00, 1.00] 500000 4.67e-05 6.35e-03% nan 1407598039 x1.00
|
||
|
atanf [-1.00, 1.00] 500000 0.00e+00 0.00e+00% 0.00e+00 1408314869 x1.00
|
||
|
atanf_c [-1.00, 1.00] 500000 1.67e-04 2.12e-02% 7.40e-05 1408872421 x1.00
|
||
|
atanf_neon [-1.00, 1.00] 500000 1.67e-04 2.12e-02% 7.40e-05 1409736652 x1.00
|
||
|
sinhf [-3.14, 3.14] 500000 0.00e+00 0.00e+00% 0.00e+00 1411101066 x1.00
|
||
|
sinhf_c [-3.14, 3.14] 500000 1.91e-06 1.52e-01% 1.85e-07 1412173492 x1.00
|
||
|
sinhf_neon [-3.14, 3.14] 500000 1.91e-06 1.52e-01% 1.90e-07 1413205410 x1.00
|
||
|
coshf [-3.14, 3.14] 500000 0.00e+00 0.00e+00% 0.00e+00 1414417802 x1.00
|
||
|
coshf_c [-3.14, 3.14] 500000 9.54e-07 2.38e-05% 1.64e-07 1415426083 x1.00
|
||
|
coshf_neon [-3.14, 3.14] 500000 1.91e-06 2.22e-05% 1.68e-07 1416412636 x1.00
|
||
|
tanhf [-3.14, 3.14] 500000 0.00e+00 0.00e+00% 0.00e+00 1417684273 x1.00
|
||
|
tanhf_c [-3.14, 3.14] 500000 1.20e-05 2.48e-01% 5.48e-06 1418659628 x1.00
|
||
|
tanhf_neon [-3.14, 3.14] 500000 2.38e-07 2.47e-01% 5.40e-08 1419650721 x1.00
|
||
|
expf [0.00, 10.00] 500000 0.00e+00 0.00e+00% 0.00e+00 1420706074 x1.00
|
||
|
expf_c [0.00, 10.00] 500000 9.77e-03 6.15e-05% 1.64e-03 1421444150 x1.00
|
||
|
expf_neon [0.00, 10.00] 500000 9.77e-03 6.58e-05% 1.64e-03 1422203499 x1.00
|
||
|
logf [1.00, 1000.00] 500000 0.00e+00 0.00e+00% 0.00e+00 1423106698 x1.00
|
||
|
logf_c [1.00, 1000.00] 500000 6.20e-06 1.62e-02% 9.83e-07 1423735174 x1.00
|
||
|
logf_neon [1.00, 1000.00] 500000 7.63e-06 1.03e-02% 1.07e-06 1424434406 x1.00
|
||
|
log10f [1.00, 1000.00] 500000 0.00e+00 0.00e+00% 0.00e+00 1425516892 x1.00
|
||
|
log10f_c [1.00, 1000.00] 500000 2.86e-06 6.68e-03% 4.79e-07 1426200368 x1.00
|
||
|
log10f_neon [1.00, 1000.00] 500000 3.34e-06 6.68e-03% 4.84e-07 1426966844 x1.00
|
||
|
floorf [1.00, 1000.00] 5000000 0.00e+00 0.00e+00% 0.00e+00 1429081993 x1.00
|
||
|
floorf_c [1.00, 1000.00] 5000000 0.00e+00 0.00e+00% 0.00e+00 1430839273 x1.00
|
||
|
floorf_neon [1.00, 1000.00] 5000000 0.00e+00 0.00e+00% 0.00e+00 1433474766 x1.00
|
||
|
ceilf [1.00, 1000.00] 5000000 0.00e+00 0.00e+00% 0.00e+00 1435602956 x1.00
|
||
|
ceilf_c [1.00, 1000.00] 5000000 0.00e+00 0.00e+00% 0.00e+00 1437403711 x1.00
|
||
|
ceilf_neon [1.00, 1000.00] 5000000 0.00e+00 0.00e+00% 0.00e+00 1440044970 x1.00
|
||
|
fabsf [1.00, 1000.00] 5000000 0.00e+00 0.00e+00% 0.00e+00 1441265630 x1.00
|
||
|
fabsf_c [1.00, 1000.00] 5000000 0.00e+00 0.00e+00% 0.00e+00 1442491716 x1.00
|
||
|
fabsf_neon [1.00, 1000.00] 5000000 0.00e+00 0.00e+00% 0.00e+00 1443680744 x1.00
|
||
|
sqrtf [1.00, 1000.00] 500000 0.00e+00 0.00e+00% 0.00e+00 1444844144 x1.00
|
||
|
sqrtf_c [1.00, 1000.00] 500000 2.33e-04 1.06e-03% 8.69e-05 1445710342 x1.00
|
||
|
sqrtf_neon [1.00, 1000.00] 500000 7.63e-06 2.91e-05% 1.60e-06 1446544637 x1.00
|
||
|
invsqrtf [1.00, 1000.00] 500000 0.00e+00 0.00e+00% 0.00e+00 1446995307 x1.00
|
||
|
invsqrtf_c [1.00, 1000.00] 500000 4.35e-06 4.78e-04% 2.00e-07 1447471977 x1.00
|
||
|
invsqrtf_neon [1.00, 1000.00] 500000 1.19e-07 2.12e-05% 4.81e-09 1447987675 x1.00
|
||
|
atan2f [0.10, 10.00] 10000 0.00e+00 0.00e+00% 0.00e+00 1449713108 x1.00
|
||
|
atan2f_c [0.10, 10.00] 10000 1.73e-04 2.23e-02% 0.00e+00 1451276575 x1.00
|
||
|
atan2f_neon [0.10, 10.00] 10000 1.67e-04 2.12e-02% 0.00e+00 1453093260 x1.00
|
||
|
powf [1.00, 10.00] 10000 0.00e+00 0.00e+00% 0.00e+00 1458606663 x1.00
|
||
|
powf_c [1.00, 10.00] 10000 1.08e+05 4.37e-03% 0.00e+00 1461584933 x1.00
|
||
|
powf_neon [1.00, 10.00] 10000 1.36e+05 5.88e-03% 0.00e+00 1464702743 x1.00
|
||
|
fmodf [1.00, 10.00] 10000 0.00e+00 0.00e+00% 0.00e+00 1466022029 x1.00
|
||
|
fmodf_c [1.00, 10.00] 10000 9.90e+00 8.06e-02% 0.00e+00 1467403015 x1.00
|
||
|
fmodf_neon [1.00, 10.00] 10000 9.97e+00 8.06e-02% 0.00e+00 1468767755 x1.00
|