mirror of
https://github.com/libretro/RetroArch.git
synced 2025-02-26 20:55:39 +00:00
Remove math-neon on frangarcj's recommendation - not used
This commit is contained in:
parent
b0ac722366
commit
f5564120a4
@ -2423,13 +2423,6 @@ ifneq ($(findstring Win32,$(OS)),)
|
||||
OBJ += led/drivers/led_win32_keyboard.o
|
||||
endif
|
||||
|
||||
ifeq ($(HAVE_MATH_NEON), 1)
|
||||
DEFINES += -DHAVE_MATH_NEON
|
||||
INCLUDE_DIRS += -I$(DEPS_DIR)/math-neon/source
|
||||
SOURCES := $(DEPS_DIR)/math-neon/source
|
||||
OBJ += $(patsubst %.c,%.o,$(foreach dir,$(SOURCES), $(wildcard $(dir)/*.c)))
|
||||
endif
|
||||
|
||||
ifeq ($(HAVE_VITAGLES), 1)
|
||||
DEFINES += -DHAVE_VITAGLES
|
||||
INCLUDE_DIRS += -I$(DEPS_DIR)/Pigs-In-A-Blanket/include
|
||||
|
@ -31,7 +31,6 @@ else
|
||||
HAVE_UPDATE_ASSETS := 1
|
||||
HAVE_ONLINE_UPDATER := 1
|
||||
HAVE_NEON := 1
|
||||
HAVE_MATH_NEON := 1
|
||||
HAVE_DSP_FILTER := 1
|
||||
HAVE_VIDEO_FILTER := 1
|
||||
HAVE_SCREENSHOTS := 1
|
||||
|
17
deps/math-neon/.gitattributes
vendored
17
deps/math-neon/.gitattributes
vendored
@ -1,17 +0,0 @@
|
||||
# Auto detect text files and perform LF normalization
|
||||
* text=auto
|
||||
|
||||
# Custom for Visual Studio
|
||||
*.cs diff=csharp
|
||||
|
||||
# Standard to msysgit
|
||||
*.doc diff=astextplain
|
||||
*.DOC diff=astextplain
|
||||
*.docx diff=astextplain
|
||||
*.DOCX diff=astextplain
|
||||
*.dot diff=astextplain
|
||||
*.DOT diff=astextplain
|
||||
*.pdf diff=astextplain
|
||||
*.PDF diff=astextplain
|
||||
*.rtf diff=astextplain
|
||||
*.RTF diff=astextplain
|
26
deps/math-neon/.gitignore
vendored
26
deps/math-neon/.gitignore
vendored
@ -1,26 +0,0 @@
|
||||
*.o
|
||||
*.a
|
||||
|
||||
# Windows thumbnail cache files
|
||||
Thumbs.db
|
||||
ehthumbs.db
|
||||
ehthumbs_vista.db
|
||||
|
||||
# Folder config file
|
||||
Desktop.ini
|
||||
|
||||
# Recycle Bin used on file shares
|
||||
$RECYCLE.BIN/
|
||||
|
||||
# Windows Installer files
|
||||
*.cab
|
||||
*.msi
|
||||
*.msm
|
||||
*.msp
|
||||
|
||||
# Windows shortcuts
|
||||
*.lnk
|
||||
|
||||
# =========================
|
||||
# Operating System Files
|
||||
# =========================
|
29
deps/math-neon/Makefile
vendored
29
deps/math-neon/Makefile
vendored
@ -1,29 +0,0 @@
|
||||
# Builds the static library $(TARGET).a from every C file under $(SOURCES)
# using the arm-vita-eabi toolchain, and optionally installs the archive and
# its public header below $(VITASDK)/$(PREFIX).

TARGET  := libmathneon
SOURCES := source

# Not referenced by any rule in this Makefile (a static archive is not linked).
LIBS = -lc -lm -lSceGxm_stub -lSceDisplay_stub

CFILES  := $(foreach dir,$(SOURCES), $(wildcard $(dir)/*.c))
# SHADERS is not set anywhere in this Makefile, so CGFILES and HEADERS expand
# to nothing unless SHADERS is supplied from the environment or command line.
CGFILES := $(foreach dir,$(SHADERS), $(wildcard $(dir)/*.cg))
HEADERS := $(CGFILES:.cg=.h)
OBJS    := $(CFILES:.c=.o)

PREFIX  = arm-vita-eabi
CC      = $(PREFIX)-gcc
AR      = $(PREFIX)-gcc-ar
CFLAGS  = -g -Wl,-q -O2 -ffast-math -mtune=cortex-a9 -mfpu=neon -flto -ftree-vectorize
ASFLAGS = $(CFLAGS)

# These targets name actions, not files; without .PHONY a stray file called
# "all", "clean" or "install" would make the corresponding rule a no-op.
.PHONY: all clean install

all: $(TARGET).a

$(TARGET).a: $(OBJS)
	$(AR) -rc $@ $^

clean:
	@rm -rf $(TARGET).a $(TARGET).elf $(OBJS)

install: $(TARGET).a
	@mkdir -p $(VITASDK)/$(PREFIX)/lib/
	cp $(TARGET).a $(VITASDK)/$(PREFIX)/lib/
	@mkdir -p $(VITASDK)/$(PREFIX)/include/
	cp source/math_neon.h $(VITASDK)/$(PREFIX)/include/
|
168
deps/math-neon/README
vendored
168
deps/math-neon/README
vendored
@ -1,168 +0,0 @@
|
||||
|
||||
Library: MATH-NEON
|
||||
By: Lachlan Tychsen-Smith
|
||||
Licence: MIT (expat)
|
||||
=======================================================================================
|
||||
This project implements the cmath functions and some optimised matrix functions
|
||||
with the aim of increasing the floating point performance of ARM Cortex A-8
|
||||
based platforms. As well as implementing the functions in ARM NEON assembly,
|
||||
they sacrifice error checking and some accuracy to achieve better performance.
|
||||
|
||||
Function Errors:
|
||||
=======================================================================================
|
||||
The measurement and characterisations of the inaccuracies present within these
|
||||
functions is really a field within itself. For the benchmark I provide the
|
||||
maximum absolute, maximum relative and root mean squared error compared to the
|
||||
cmath implementations over the specified range. However these values can be
|
||||
misleading, especially for functions which quickly go to infinity. So it's always a
|
||||
good idea to test it within your actual program. In general, this library will not
|
||||
be as accurate as cmath, however for many functions it is close enough to be
|
||||
negligible.
|
||||
|
||||
Notes:
|
||||
=======================================================================================
|
||||
- The *_c functions are c implementations of the *_neon code.
|
||||
- Like cmath, The errors present in the functions are very dependent on the
|
||||
range in which you're operating. So you should test them first.
|
||||
- Look in the "math_neon.h" file for descriptions of the functions. In some
|
||||
function files there are also notes on the specific implementation.
|
||||
- The *_neon functions make certain assumptions about the location of arguments
|
||||
that is incompatible with inlining.
|
||||
|
||||
Contact:
|
||||
=======================================================================================
|
||||
Name: Lachlan Tychsen-Smith
|
||||
Email: lachlan.ts@gmail.com
|
||||
|
||||
PSVITA performances test results:
|
||||
|
||||
RUNFAST: Disabled
|
||||
------------------------------------------------------------------------------------------------------
|
||||
MATRIX FUNCTION TESTS
|
||||
------------------------------------------------------------------------------------------------------
|
||||
matmul2_c =
|
||||
|-14.56, 5.96|
|
||||
|-15.35, 10.50|
|
||||
matmul2_neon =
|
||||
|-14.56, 5.96|
|
||||
|-15.35, 10.50|
|
||||
matmul2: c=174924 neon=64490 rate=2.71
|
||||
matvec2_c = |-14.56, -15.35|
|
||||
matvec2_neon = |-14.56, -15.35|
|
||||
matvec2: c=88957 neon=58337 rate=1.52
|
||||
matmul3_c =
|
||||
|-21.39, -4.68, -1.74|
|
||||
|-8.66, -8.97, 1.83|
|
||||
|15.88, 0.30, -2.23|
|
||||
matmul3_neon =
|
||||
|-21.39, -4.68, -1.74|
|
||||
|-8.66, -8.97, 1.83|
|
||||
|15.88, 0.30, -2.23|
|
||||
matmul3: c=552486 neon=297268 rate=1.86
|
||||
matvec3_c = |-21.39, -8.66, 15.88|
|
||||
matvec3_neon = |-21.39, -8.66, 15.88|
|
||||
matvec3: c=184104 neon=128780 rate=1.43
|
||||
matmul4_c =
|
||||
|-13.65, -1.80, -12.92, 6.56|
|
||||
|-10.21, 9.47, 2.73, 14.79|
|
||||
|0.97, 11.69, -0.64, -12.87|
|
||||
|20.06, 6.77, 35.61, -0.02|
|
||||
matmul4_neon =
|
||||
|-13.65, -1.80, -12.92, 6.56|
|
||||
|-10.21, 9.47, 2.73, 14.79|
|
||||
|0.97, 11.69, -0.64, -12.87|
|
||||
|20.06, 6.77, 35.61, -0.02|
|
||||
matmul4: c=1315568 neon=254227 rate=5.17
|
||||
matvec4_c = |-13.65, -10.21, 0.97, 20.058556|
|
||||
matvec4_neon = |-13.65, -10.21, 0.97, 20.058556|
|
||||
matvec4: c=331712 neon=147196 rate=2.25
|
||||
|
||||
dot2_c = -10.903330
|
||||
dot2_neon = -10.903330
|
||||
dot2: c=230295 neon=168799 rate=1.36
|
||||
normalize2_c = [-0.74, 0.67]
|
||||
normalize2_neon = [-0.74, 0.67]
|
||||
normalize2: c=950716 neon=965780 rate=0.98
|
||||
|
||||
dot3_c = -4.226746
|
||||
dot3_neon = -4.226746
|
||||
dot3: c=306957 neon=337316 rate=0.91
|
||||
normalize3_c = [-0.69, 0.62, -0.38]
|
||||
normalize3_neon = [-0.69, 0.62, -0.38]
|
||||
normalize3: c=1180950 neon=1134557 rate=1.04
|
||||
cross3_c = [-9.67, -19.39, -14.24]
|
||||
cross3_neon = [-9.67, -19.39, -14.24]
|
||||
cross3: c=659558 neon=766896 rate=0.86
|
||||
|
||||
dot4_c = 2.782796
|
||||
dot4_neon = 2.782796
|
||||
dot4: c=414233 neon=276068 rate=1.50
|
||||
normalize4_c = [-0.59, 0.53, -0.32, -0.52]
|
||||
normalize4_neon = [-0.59, 0.53, -0.32, -0.52]
|
||||
normalize4: c=1364294 neon=1103327 rate=1.24
|
||||
|
||||
------------------------------------------------------------------------------------------------------
|
||||
CMATH FUNCTION TESTS
|
||||
------------------------------------------------------------------------------------------------------
|
||||
Function Range Number ABS Max Error REL Max Error RMS Error Time Rate
|
||||
------------------------------------------------------------------------------------------------------
|
||||
sinf [-3.14, 3.14] 500000 0.00e+00 0.00e+00% 0.00e+00 1394459996 x1.00
|
||||
sinf_c [-3.14, 3.14] 500000 7.75e-07 1.00e+02% 4.09e-07 1395128226 x1.00
|
||||
sinf_neon [-3.14, 3.14] 500000 8.34e-07 1.00e+02% 4.09e-07 1395853554 x1.00
|
||||
cosf [-3.14, 3.14] 500000 0.00e+00 0.00e+00% 0.00e+00 1396644271 x1.00
|
||||
cosf_c [-3.14, 3.14] 500000 7.75e-07 6.74e-01% 4.15e-07 1397360321 x1.00
|
||||
cosf_neon [-3.14, 3.14] 500000 8.34e-07 6.74e-01% 4.16e-07 1398126872 x1.00
|
||||
tanf [-0.79, 0.79] 500000 0.00e+00 0.00e+00% 0.00e+00 1398889596 x1.00
|
||||
tanf_c [-0.79, 0.79] 500000 2.98e-06 7.94e-04% 1.31e-06 1399704712 x1.00
|
||||
tanf_neon [-0.79, 0.79] 500000 1.91e-06 3.62e-04% 6.66e-07 1400612899 x1.00
|
||||
asinf [-1.00, 1.00] 500000 0.00e+00 0.00e+00% 0.00e+00 1401838993 x1.00
|
||||
asinf_c [-1.00, 1.00] 500000 5.54e-05 1.06e-02% nan 1402745512 x1.00
|
||||
asinf_neon [-1.00, 1.00] 500000 4.66e-05 8.90e-03% nan 1403967661 x1.00
|
||||
acosf [-1.00, 1.00] 500000 0.00e+00 0.00e+00% 0.00e+00 1405317842 x1.00
|
||||
acosf_c [-1.00, 1.00] 500000 5.56e-05 6.46e-03% nan 1406294753 x1.00
|
||||
acosf_neon [-1.00, 1.00] 500000 4.67e-05 6.35e-03% nan 1407598039 x1.00
|
||||
atanf [-1.00, 1.00] 500000 0.00e+00 0.00e+00% 0.00e+00 1408314869 x1.00
|
||||
atanf_c [-1.00, 1.00] 500000 1.67e-04 2.12e-02% 7.40e-05 1408872421 x1.00
|
||||
atanf_neon [-1.00, 1.00] 500000 1.67e-04 2.12e-02% 7.40e-05 1409736652 x1.00
|
||||
sinhf [-3.14, 3.14] 500000 0.00e+00 0.00e+00% 0.00e+00 1411101066 x1.00
|
||||
sinhf_c [-3.14, 3.14] 500000 1.91e-06 1.52e-01% 1.85e-07 1412173492 x1.00
|
||||
sinhf_neon [-3.14, 3.14] 500000 1.91e-06 1.52e-01% 1.90e-07 1413205410 x1.00
|
||||
coshf [-3.14, 3.14] 500000 0.00e+00 0.00e+00% 0.00e+00 1414417802 x1.00
|
||||
coshf_c [-3.14, 3.14] 500000 9.54e-07 2.38e-05% 1.64e-07 1415426083 x1.00
|
||||
coshf_neon [-3.14, 3.14] 500000 1.91e-06 2.22e-05% 1.68e-07 1416412636 x1.00
|
||||
tanhf [-3.14, 3.14] 500000 0.00e+00 0.00e+00% 0.00e+00 1417684273 x1.00
|
||||
tanhf_c [-3.14, 3.14] 500000 1.20e-05 2.48e-01% 5.48e-06 1418659628 x1.00
|
||||
tanhf_neon [-3.14, 3.14] 500000 2.38e-07 2.47e-01% 5.40e-08 1419650721 x1.00
|
||||
expf [0.00, 10.00] 500000 0.00e+00 0.00e+00% 0.00e+00 1420706074 x1.00
|
||||
expf_c [0.00, 10.00] 500000 9.77e-03 6.15e-05% 1.64e-03 1421444150 x1.00
|
||||
expf_neon [0.00, 10.00] 500000 9.77e-03 6.58e-05% 1.64e-03 1422203499 x1.00
|
||||
logf [1.00, 1000.00] 500000 0.00e+00 0.00e+00% 0.00e+00 1423106698 x1.00
|
||||
logf_c [1.00, 1000.00] 500000 6.20e-06 1.62e-02% 9.83e-07 1423735174 x1.00
|
||||
logf_neon [1.00, 1000.00] 500000 7.63e-06 1.03e-02% 1.07e-06 1424434406 x1.00
|
||||
log10f [1.00, 1000.00] 500000 0.00e+00 0.00e+00% 0.00e+00 1425516892 x1.00
|
||||
log10f_c [1.00, 1000.00] 500000 2.86e-06 6.68e-03% 4.79e-07 1426200368 x1.00
|
||||
log10f_neon [1.00, 1000.00] 500000 3.34e-06 6.68e-03% 4.84e-07 1426966844 x1.00
|
||||
floorf [1.00, 1000.00] 5000000 0.00e+00 0.00e+00% 0.00e+00 1429081993 x1.00
|
||||
floorf_c [1.00, 1000.00] 5000000 0.00e+00 0.00e+00% 0.00e+00 1430839273 x1.00
|
||||
floorf_neon [1.00, 1000.00] 5000000 0.00e+00 0.00e+00% 0.00e+00 1433474766 x1.00
|
||||
ceilf [1.00, 1000.00] 5000000 0.00e+00 0.00e+00% 0.00e+00 1435602956 x1.00
|
||||
ceilf_c [1.00, 1000.00] 5000000 0.00e+00 0.00e+00% 0.00e+00 1437403711 x1.00
|
||||
ceilf_neon [1.00, 1000.00] 5000000 0.00e+00 0.00e+00% 0.00e+00 1440044970 x1.00
|
||||
fabsf [1.00, 1000.00] 5000000 0.00e+00 0.00e+00% 0.00e+00 1441265630 x1.00
|
||||
fabsf_c [1.00, 1000.00] 5000000 0.00e+00 0.00e+00% 0.00e+00 1442491716 x1.00
|
||||
fabsf_neon [1.00, 1000.00] 5000000 0.00e+00 0.00e+00% 0.00e+00 1443680744 x1.00
|
||||
sqrtf [1.00, 1000.00] 500000 0.00e+00 0.00e+00% 0.00e+00 1444844144 x1.00
|
||||
sqrtf_c [1.00, 1000.00] 500000 2.33e-04 1.06e-03% 8.69e-05 1445710342 x1.00
|
||||
sqrtf_neon [1.00, 1000.00] 500000 7.63e-06 2.91e-05% 1.60e-06 1446544637 x1.00
|
||||
invsqrtf [1.00, 1000.00] 500000 0.00e+00 0.00e+00% 0.00e+00 1446995307 x1.00
|
||||
invsqrtf_c [1.00, 1000.00] 500000 4.35e-06 4.78e-04% 2.00e-07 1447471977 x1.00
|
||||
invsqrtf_neon [1.00, 1000.00] 500000 1.19e-07 2.12e-05% 4.81e-09 1447987675 x1.00
|
||||
atan2f [0.10, 10.00] 10000 0.00e+00 0.00e+00% 0.00e+00 1449713108 x1.00
|
||||
atan2f_c [0.10, 10.00] 10000 1.73e-04 2.23e-02% 0.00e+00 1451276575 x1.00
|
||||
atan2f_neon [0.10, 10.00] 10000 1.67e-04 2.12e-02% 0.00e+00 1453093260 x1.00
|
||||
powf [1.00, 10.00] 10000 0.00e+00 0.00e+00% 0.00e+00 1458606663 x1.00
|
||||
powf_c [1.00, 10.00] 10000 1.08e+05 4.37e-03% 0.00e+00 1461584933 x1.00
|
||||
powf_neon [1.00, 10.00] 10000 1.36e+05 5.88e-03% 0.00e+00 1464702743 x1.00
|
||||
fmodf [1.00, 10.00] 10000 0.00e+00 0.00e+00% 0.00e+00 1466022029 x1.00
|
||||
fmodf_c [1.00, 10.00] 10000 9.90e+00 8.06e-02% 0.00e+00 1467403015 x1.00
|
||||
fmodf_neon [1.00, 10.00] 10000 9.97e+00 8.06e-02% 0.00e+00 1468767755 x1.00
|
689
deps/math-neon/math_debug.c
vendored
689
deps/math-neon/math_debug.c
vendored
@ -1,689 +0,0 @@
|
||||
/*
|
||||
Math-NEON: Neon Optimised Math Library based on cmath
|
||||
Contact: lachlan.ts@gmail.com
|
||||
Copyright (C) 2009 Lachlan Tychsen - Smith aka Adventus
|
||||
|
||||
This library is free software; you can redistribute it and/or
|
||||
modify it under the terms of the GNU Lesser General Public
|
||||
License as published by the Free Software Foundation; either
|
||||
version 3 of the License, or (at your option) any later version.
|
||||
|
||||
This library is distributed in the hope that it will be useful,
|
||||
but WITHOUT ANY WARRANTY; without even the implied warranty of
|
||||
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
|
||||
Lesser General Public License for more details.
|
||||
|
||||
You should have received a copy of the GNU Lesser General Public
|
||||
License along with this library; if not, write to the Free
|
||||
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
|
||||
*/
|
||||
|
||||
|
||||
#include <math_neon.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdarg.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <math.h>
|
||||
#include <time.h>
|
||||
#ifdef WIN32
|
||||
#include <time.h>
|
||||
#else
|
||||
#include <sys/time.h>
|
||||
#include <sys/resource.h>
|
||||
#endif
|
||||
|
||||
#define randf() (rand() / (RAND_MAX + 1.0f))
|
||||
|
||||
/*
 * Append a printf-style formatted message to the benchmark log file
 * "ux0:/data/mathneon.log".
 *
 * format: printf-style format string, followed by the values it consumes.
 *
 * The message is formatted into a fixed 512-byte buffer. Output that would
 * not fit is truncated instead of overrunning the buffer. If formatting
 * fails or the log file cannot be opened, the message is dropped.
 */
void LOG(const char *format, ...) {
	char msg[512];
	va_list arg;
	int len;

	va_start(arg, format);
	/* Bounded formatting: vsprintf() here could write past the end of msg. */
	len = vsnprintf(msg, sizeof(msg), format, arg);
	va_end(arg);

	if (len < 0)
		return; /* formatting error: msg contents are unspecified */
	if ((size_t)len >= sizeof(msg))
		len = (int)sizeof(msg) - 1; /* output was truncated to fit msg */

	FILE *log = fopen("ux0:/data/mathneon.log", "a+");
	if (log != NULL) {
		fwrite(msg, 1, (size_t)len, log);
		fclose(log);
	}
}
|
||||
|
||||
/*
 * Descriptor and result record for one single-argument function test.
 * name..num are supplied by the static table initialisers; the remaining
 * members are written by test_mathfunc1().
 */
struct test1_s {
	const char *name;       /* name of the table entry */
	float (*func)(float);   /* the function under test */
	float (*bench)(float);  /* the function to benchmark against */
	float rng0;             /* start of the sampled input range */
	float rng1;             /* end of the sampled input range (loop runs while x < rng1) */
	int num;                /* number of steps the range is divided into */
	float emaxabs;          /* largest absolute error seen */
	float xmaxabs;          /* input x at which emaxabs was seen */
	float emaxrel;          /* largest relative error seen, in percent */
	float xmaxrel;          /* input x at which emaxrel was seen */
	float erms;             /* root mean squared error */
	int time;               /* time to execute num functions */
};
|
||||
|
||||
/*
 * Descriptor and result record for one two-argument function test.
 * name..num are supplied by the static table initialisers; the error and
 * time members are written by test_mathfunc2().
 */
struct test2_s {
	const char *name;              /* name of the table entry */
	float (*func)(float, float);   /* the function under test */
	float (*bench)(float, float);  /* the function to benchmark against */
	float rng0;                    /* start of the range sampled for both arguments */
	float rng1;                    /* end of that range (loops run while value < rng1) */
	int num;                       /* divisor used to derive the sampling step */
	float emaxabs;                 /* largest absolute error seen */
	float xmaxabs;                 /* first argument x at which emaxabs was seen */
	float emaxrel;                 /* largest relative error seen, in percent */
	float xmaxrel;                 /* first argument x at which emaxrel was seen */
	float erms;                    /* root mean squared error */
	int time;                      /* time to execute num functions */
};
|
||||
|
||||
|
||||
float invsqrtf(float x){
|
||||
return (1.0f / sqrtf(x));
|
||||
}
|
||||
|
||||
typedef struct test1_s test1_t;
|
||||
typedef struct test2_s test2_t;
|
||||
|
||||
test1_t test1[51] =
|
||||
{
|
||||
{"sinf ", sinf, sinf, -M_PI, M_PI, 500000},
|
||||
{"sinf_c ", sinf_c, sinf, -M_PI, M_PI, 500000},
|
||||
{"sinf_neon ", sinf_neon, sinf, -M_PI, M_PI, 500000},
|
||||
|
||||
{"cosf ", cosf, cosf, -M_PI, M_PI, 500000},
|
||||
{"cosf_c ", cosf_c, cosf, -M_PI, M_PI, 500000},
|
||||
{"cosf_neon ", cosf_neon, cosf, -M_PI, M_PI, 500000},
|
||||
|
||||
{"tanf ", tanf, tanf, -M_PI_4, M_PI_4, 500000, 0, 0, 0},
|
||||
{"tanf_c ", tanf_c, tanf, -M_PI_4, M_PI_4, 500000, 0, 0, 0},
|
||||
{"tanf_neon ", tanf_neon, tanf, -M_PI_4, M_PI_4, 500000, 0, 0, 0},
|
||||
|
||||
{"asinf ", asinf, asinf, -1, 1, 500000, 0, 0, 0},
|
||||
{"asinf_c ", asinf_c, asinf, -1, 1, 500000, 0, 0, 0},
|
||||
{"asinf_neon ", asinf_neon, asinf, -1, 1, 500000, 0, 0, 0},
|
||||
|
||||
{"acosf ", acosf, acosf, -1, 1, 500000, 0, 0, 0},
|
||||
{"acosf_c ", acosf_c, acosf, -1, 1, 500000, 0, 0, 0},
|
||||
{"acosf_neon ", acosf_neon, acosf, -1, 1, 500000, 0, 0, 0},
|
||||
|
||||
{"atanf ", atanf, atanf, -1, 1, 500000, 0, 0, 0},
|
||||
{"atanf_c ", atanf_c, atanf, -1, 1, 500000, 0, 0, 0},
|
||||
{"atanf_neon ", atanf_neon, atanf, -1, 1, 500000, 0, 0, 0},
|
||||
|
||||
{"sinhf ", sinhf, sinhf, -M_PI, M_PI, 500000, 0, 0, 0},
|
||||
{"sinhf_c ", sinhf_c, sinhf, -M_PI, M_PI, 500000, 0, 0, 0},
|
||||
{"sinhf_neon ", sinhf_neon, sinhf, -M_PI, M_PI, 500000, 0, 0, 0},
|
||||
|
||||
{"coshf ", coshf, coshf, -M_PI, M_PI, 500000, 0, 0, 0},
|
||||
{"coshf_c ", coshf_c, coshf, -M_PI, M_PI, 500000, 0, 0, 0},
|
||||
{"coshf_neon ", coshf_neon, coshf, -M_PI, M_PI, 500000, 0, 0, 0},
|
||||
|
||||
{"tanhf ", tanhf, tanhf, -M_PI, M_PI, 500000, 0, 0, 0},
|
||||
{"tanhf_c ", tanhf_c, tanhf, -M_PI, M_PI, 500000, 0, 0, 0},
|
||||
{"tanhf_neon ", tanhf_neon, tanhf, -M_PI, M_PI, 500000, 0, 0, 0},
|
||||
|
||||
{"expf ", expf, expf, 0, 10, 500000, 0, 0, 0},
|
||||
{"expf_c ", expf_c, expf, 0, 10, 500000, 0, 0, 0},
|
||||
{"expf_neon ", expf_neon, expf, 0, 10, 500000, 0, 0, 0},
|
||||
|
||||
{"logf ", logf, logf, 1, 1000, 500000, 0, 0, 0},
|
||||
{"logf_c ", logf_c, logf, 1, 1000, 500000, 0, 0, 0},
|
||||
{"logf_neon ", logf_neon, logf, 1, 1000, 500000, 0, 0, 0},
|
||||
|
||||
{"log10f ", log10f, log10f, 1, 1000, 500000, 0, 0, 0},
|
||||
{"log10f_c ", log10f_c, log10f, 1, 1000, 500000, 0, 0, 0},
|
||||
{"log10f_neon ", log10f_neon,log10f, 1, 1000, 500000, 0, 0, 0},
|
||||
|
||||
{"floorf ", floorf, floorf, 1, 1000, 5000000, 0, 0, 0},
|
||||
{"floorf_c ", floorf_c, floorf, 1, 1000, 5000000, 0, 0, 0},
|
||||
{"floorf_neon", floorf_neon,floorf, 1, 1000, 5000000, 0, 0, 0},
|
||||
|
||||
{"ceilf ", ceilf, ceilf, 1, 1000, 5000000, 0, 0, 0},
|
||||
{"ceilf_c ", ceilf_c, ceilf, 1, 1000, 5000000, 0, 0, 0},
|
||||
{"ceilf_neon", ceilf_neon, ceilf, 1, 1000, 5000000, 0, 0, 0},
|
||||
|
||||
{"fabsf ", fabsf, fabsf, 1, 1000, 5000000, 0, 0, 0},
|
||||
{"fabsf_c ", fabsf_c, fabsf, 1, 1000, 5000000, 0, 0, 0},
|
||||
{"fabsf_neon", fabsf_neon, fabsf, 1, 1000, 5000000, 0, 0, 0},
|
||||
|
||||
{"sqrtf ", sqrtf, sqrtf, 1, 1000, 500000, 0, 0, 0},
|
||||
{"sqrtf_c ", sqrtf_c, sqrtf, 1, 1000, 500000, 0, 0, 0},
|
||||
{"sqrtf_neon ", sqrtf_neon, sqrtf, 1, 1000, 500000, 0, 0, 0},
|
||||
|
||||
{"invsqrtf ", invsqrtf, invsqrtf, 1, 1000, 500000, 0, 0, 0},
|
||||
{"invsqrtf_c ", invsqrtf_c, invsqrtf, 1, 1000, 500000, 0, 0, 0},
|
||||
{"invsqrtf_neon ", invsqrtf_neon, invsqrtf, 1, 1000, 500000, 0, 0, 0},
|
||||
};
|
||||
|
||||
test2_t test2[9] =
|
||||
{
|
||||
{"atan2f ", atan2f, atan2f, 0.1, 10, 10000, 0, 0, 0},
|
||||
{"atan2f_c ", atan2f_c, atan2f, 0.1, 10, 10000, 0, 0, 0},
|
||||
{"atan2f_neon ", atan2f_neon,atan2f, 0.1, 10, 10000, 0, 0, 0},
|
||||
|
||||
{"powf ", powf, powf, 1, 10, 10000, 0, 0, 0},
|
||||
{"powf_c ", powf_c, powf, 1, 10, 10000, 0, 0, 0},
|
||||
{"powf_neon ", powf_neon, powf, 1, 10, 10000, 0, 0, 0},
|
||||
|
||||
{"fmodf ", fmodf, fmodf, 1, 10, 10000, 0, 0, 0},
|
||||
{"fmodf_c ", fmodf_c, fmodf, 1, 10, 10000, 0, 0, 0},
|
||||
{"fmodf_neon ", fmodf_neon, fmodf, 1, 10, 10000, 0, 0, 0},
|
||||
|
||||
};
|
||||
|
||||
|
||||
void
|
||||
test_mathfunc1(test1_t *tst)
|
||||
{
|
||||
|
||||
float x;
|
||||
float dx = (tst->rng1 - tst->rng0) / ((float)tst->num);
|
||||
#ifndef WIN32
|
||||
struct rusage ru;
|
||||
#endif
|
||||
|
||||
tst->emaxabs = tst->xmaxabs = 0;
|
||||
tst->emaxrel = tst->xmaxrel = 0;
|
||||
tst->erms = 0;
|
||||
for(x = tst->rng0; x < tst->rng1 ; x += dx){
|
||||
float r = (tst->func)((float)x);
|
||||
float rr = (tst->bench)((float)x);
|
||||
float dr = fabs(r - rr);
|
||||
float drr = dr * (100.0f / rr);
|
||||
tst->erms += dr*dr;
|
||||
if (dr > tst->emaxabs){
|
||||
tst->emaxabs = dr;
|
||||
tst->xmaxabs = x;
|
||||
}
|
||||
if (drr > tst->emaxrel){
|
||||
tst->emaxrel = drr;
|
||||
tst->xmaxrel = x;
|
||||
}
|
||||
}
|
||||
tst->erms = sqrt(tst->erms / ((float) tst->num));
|
||||
|
||||
#ifdef WIN32
|
||||
tst->time = (1000 * clock()) / (CLOCKS_PER_SEC / 1000);
|
||||
#else
|
||||
tst->time = sceKernelGetSystemTimeWide();
|
||||
#endif
|
||||
|
||||
for(x = tst->rng0; x < tst->rng1 ; x += dx){
|
||||
(tst->func)((float)x);
|
||||
}
|
||||
|
||||
#ifdef WIN32
|
||||
tst->time = (1000 * clock()) / (CLOCKS_PER_SEC / 1000) - tst->time;
|
||||
#else
|
||||
tst->time = sceKernelGetSystemTimeWide();
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
void
|
||||
test_mathfunc2(test2_t *tst)
|
||||
{
|
||||
float x, y;
|
||||
float rng = tst->rng1 - tst->rng0;
|
||||
float d = (rng * rng) / ((float) tst->num);
|
||||
#ifndef WIN32
|
||||
struct rusage ru;
|
||||
#endif
|
||||
|
||||
tst->emaxabs = tst->xmaxabs = 0;
|
||||
tst->emaxrel = tst->xmaxrel = 0;
|
||||
for(y = (tst->rng0); y < (tst->rng1) ; y += d){
|
||||
for(x = (tst->rng0); x < (tst->rng1); x += d){
|
||||
float r = (tst->func)((float)x, y);
|
||||
float rr = (tst->bench)((float)x, y);
|
||||
float dr = fabs(r - rr);
|
||||
float drr = dr * (100.0f / rr);
|
||||
if (dr > tst->emaxabs){
|
||||
tst->emaxabs = dr;
|
||||
tst->xmaxabs = x;
|
||||
}
|
||||
if (drr > tst->emaxrel && fabsf(rr) > 0.0001){
|
||||
tst->emaxrel = drr;
|
||||
tst->xmaxrel = x;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef WIN32
|
||||
tst->time = (1000 * clock()) / (CLOCKS_PER_SEC / 1000) ;
|
||||
#else
|
||||
tst->time = sceKernelGetSystemTimeWide();
|
||||
#endif
|
||||
|
||||
for(y = tst->rng0; y < tst->rng1 ; y += d){
|
||||
for(x = tst->rng0; x < tst->rng1 ; x += d){
|
||||
(tst->func)((float)x, (float)y);
|
||||
}
|
||||
}
|
||||
|
||||
#ifdef WIN32
|
||||
tst->time = (1000 * clock()) / (CLOCKS_PER_SEC / 1000) - tst->time;
|
||||
#else
|
||||
tst->time = sceKernelGetSystemTimeWide();
|
||||
#endif
|
||||
|
||||
}
|
||||
|
||||
/*
 * Store three timestamps in t[0..2] around two loops of `testnum` iterations
 * each: first c_stmt, then neon_stmt. This is a macro rather than a helper
 * taking function pointers so the timed loops still contain direct calls.
 * Expects an int named `testnum` in the enclosing scope.
 */
#define VEC_TIME_PAIR(t, c_stmt, neon_stmt)                          \
	do {                                                             \
		(t)[0] = sceKernelGetSystemTimeWide();                       \
		for (int n_ = 0; n_ < testnum; n_++) { c_stmt; }             \
		(t)[1] = sceKernelGetSystemTimeWide();                       \
		for (int n_ = 0; n_ < testnum; n_++) { neon_stmt; }          \
		(t)[2] = sceKernelGetSystemTimeWide();                       \
	} while (0)

/*
 * Log both elapsed times from t[0..2] and their ratio (first loop's time
 * divided by second loop's time) using the given format string.
 */
#define VEC_LOG_RATE(fmt, t)                                         \
	LOG(fmt, (t)[1] - (t)[0], (t)[2] - (t)[1],                       \
	    (float)((t)[1] - (t)[0]) / (float)((t)[2] - (t)[1]))

/*
 * Benchmark the 2-, 3- and 4-component vector routines (dot, normalize,
 * cross) in their *_c and *_neon variants on random inputs in [-5, 5),
 * logging each variant's result, both timings and the c/neon time ratio.
 */
void test_vectorfunc()
{
	float v0[4], v1[4], d[4];

	for(int i=0;i<4;i++)
	{
		v0[i] = 10*randf() - 5;
		v1[i] = 10*randf() - 5;
		d[i] = 10*randf() - 5;
	}

	int testnum = 5000000;
	int v2t[3], v3t[3], v4t[3];
	float r;

	LOG("\n");

	//dot 2
	VEC_TIME_PAIR(v2t, r = dot2_c(v0, v1), r = dot2_neon(v0, v1));

	r = dot2_c(v0, v1);
	LOG("dot2_c = %f\n", r);
	r = dot2_neon(v0, v1);
	LOG("dot2_neon = %f\n", r);

	VEC_LOG_RATE("dot2: c=%i \t neon=%i \t rate=%.2f \n", v2t);

	//normalize 2
	VEC_TIME_PAIR(v2t, normalize2_c(v0, d), normalize2_neon(v0, d));

	normalize2_c(v0, d);
	LOG("normalize2_c = [%.2f, %.2f]\n", d[0], d[1]);
	normalize2_neon(v0, d);
	LOG("normalize2_neon = [%.2f, %.2f]\n", d[0], d[1]);

	VEC_LOG_RATE("normalize2: c=%i \t neon=%i \t rate=%.2f \n", v2t);
	LOG("\n");

	//dot 3
	VEC_TIME_PAIR(v3t, r = dot3_c(v0, v1), r = dot3_neon(v0, v1));

	r = dot3_c(v0, v1);
	LOG("dot3_c = %f\n", r);
	r = dot3_neon(v0, v1);
	LOG("dot3_neon = %f\n", r);

	VEC_LOG_RATE("dot3: c=%i \t neon=%i \t rate=%.2f \n", v3t);

	//normalize 3
	VEC_TIME_PAIR(v3t, normalize3_c(v0, d), normalize3_neon(v0, d));

	normalize3_c(v0, d);
	LOG("normalize3_c = [%.2f, %.2f, %.2f]\n", d[0], d[1], d[2]);
	normalize3_neon(v0, d);
	LOG("normalize3_neon = [%.2f, %.2f, %.2f]\n", d[0], d[1], d[2]);

	VEC_LOG_RATE("normalize3: c=%i \t neon=%i \t rate=%.2f \n", v3t);

	//cross 3
	VEC_TIME_PAIR(v3t, cross3_c(v0, v1, d), cross3_neon(v0, v1, d));

	cross3_c(v0, v1, d);
	LOG("cross3_c = [%.2f, %.2f, %.2f]\n", d[0], d[1], d[2]);
	cross3_neon(v0, v1, d);
	LOG("cross3_neon = [%.2f, %.2f, %.2f]\n", d[0], d[1], d[2]);

	VEC_LOG_RATE("cross3: c=%i \t neon=%i \t rate=%.2f \n", v3t);
	LOG("\n");

	//dot 4
	VEC_TIME_PAIR(v4t, r = dot4_c(v0, v1), r = dot4_neon(v0, v1));

	r = dot4_c(v0, v1);
	LOG("dot4_c = %f\n", r);
	r = dot4_neon(v0, v1);
	LOG("dot4_neon = %f\n", r);

	VEC_LOG_RATE("dot4: c=%i \t neon=%i \t rate=%.2f \n", v4t);

	//normalize 4
	VEC_TIME_PAIR(v4t, normalize4_c(v0, d), normalize4_neon(v0, d));

	normalize4_c(v0, d);
	LOG("normalize4_c = [%.2f, %.2f, %.2f, %.2f]\n", d[0], d[1], d[2], d[3]);
	normalize4_neon(v0, d);
	LOG("normalize4_neon = [%.2f, %.2f, %.2f, %.2f]\n", d[0], d[1], d[2], d[3]);

	VEC_LOG_RATE("normalize4: c=%i \t neon=%i \t rate=%.2f \n", v4t);
	LOG("\n");
}

#undef VEC_LOG_RATE
#undef VEC_TIME_PAIR
|
||||
|
||||
|
||||
|
||||
/*
 * Store three timestamps in t[0..2] around two loops of `testnum` iterations
 * each: first c_stmt, then neon_stmt. This is a macro rather than a helper
 * taking function pointers so the timed loops still contain direct calls.
 * Expects an int named `testnum` in the enclosing scope.
 */
#define MAT_TIME_PAIR(t, c_stmt, neon_stmt)                          \
	do {                                                             \
		(t)[0] = sceKernelGetSystemTimeWide();                       \
		for (int n_ = 0; n_ < testnum; n_++) { c_stmt; }             \
		(t)[1] = sceKernelGetSystemTimeWide();                       \
		for (int n_ = 0; n_ < testnum; n_++) { neon_stmt; }          \
		(t)[2] = sceKernelGetSystemTimeWide();                       \
	} while (0)

/*
 * Log both elapsed times from t[0..2] and their ratio (first loop's time
 * divided by second loop's time) using the given format string.
 */
#define MAT_LOG_RATE(fmt, t)                                         \
	LOG(fmt, (t)[1] - (t)[0], (t)[2] - (t)[1],                       \
	    (float)((t)[1] - (t)[0]) / (float)((t)[2] - (t)[1]))

/*
 * Benchmark the 2x2, 3x3 and 4x4 matrix-matrix and matrix-vector routines in
 * their *_c and *_neon variants on random inputs in [-5, 5), logging each
 * variant's result, both timings and the c/neon time ratio.
 */
void test_matrixfunc()
{
	float m0[16], m1[16], m2[16];
	int m2t[3], m3t[3], m4t[3];
	int testnum = 1000000;

	for(int i=0;i<16;i++)
	{
		m0[i] = 10.0f * randf() - 5.0f;
		m1[i] = 10.0f * randf() - 5.0f;
		m2[i] = 10.0f * randf() - 5.0f;
	}

	//matmul2
	MAT_TIME_PAIR(m2t, matmul2_c(m0, m1, m2), matmul2_neon(m0, m1, m2));

	matmul2_c(m0, m1, m2);
	LOG("matmul2_c = \n");
	LOG("\t\t\t|%.2f, %.2f|\n", m2[0], m2[2]);
	LOG("\t\t\t|%.2f, %.2f|\n", m2[1], m2[3]);

	matmul2_neon(m0, m1, m2);
	LOG("matmul2_neon = \n");
	LOG("\t\t\t|%.2f, %.2f|\n", m2[0], m2[2]);
	LOG("\t\t\t|%.2f, %.2f|\n", m2[1], m2[3]);

	MAT_LOG_RATE("matmul2: c=%i \t neon=%i \t rate=%.2f \n", m2t);

	//matvec2
	MAT_TIME_PAIR(m2t, matvec2_c(m0, m1, m2), matvec2_neon(m0, m1, m2));

	memset(m2, 0, 4*sizeof(float));
	matvec2_c(m0, m1, m2);
	LOG("matvec2_c = |%.2f, %.2f|\n", m2[0], m2[1]);

	memset(m2, 0, 4*sizeof(float));
	matvec2_neon(m0, m1, m2);
	LOG("matvec2_neon = |%.2f, %.2f|\n", m2[0], m2[1]);

	MAT_LOG_RATE("matvec2: c=%i \t neon=%i \t rate=%.2f \n", m2t);

	//MAT3
	MAT_TIME_PAIR(m3t, matmul3_c(m0, m1, m2), matmul3_neon(m0, m1, m2));

	memset(m2, 0, 9*sizeof(float));
	matmul3_c(m0, m1, m2);
	LOG("matmul3_c =\n");
	LOG("\t\t\t|%.2f, %.2f, %.2f|\n", m2[0], m2[3], m2[6]);
	LOG("\t\t\t|%.2f, %.2f, %.2f|\n", m2[1], m2[4], m2[7]);
	LOG("\t\t\t|%.2f, %.2f, %.2f|\n", m2[2], m2[5], m2[8]);

	memset(m2, 0, 9*sizeof(float));
	matmul3_neon(m0, m1, m2);
	LOG("matmul3_neon =\n");
	LOG("\t\t\t|%.2f, %.2f, %.2f|\n", m2[0], m2[3], m2[6]);
	LOG("\t\t\t|%.2f, %.2f, %.2f|\n", m2[1], m2[4], m2[7]);
	LOG("\t\t\t|%.2f, %.2f, %.2f|\n", m2[2], m2[5], m2[8]);

	MAT_LOG_RATE("matmul3: c=%i \t neon=%i \t rate=%.2f \n", m3t);

	//matvec3
	MAT_TIME_PAIR(m3t, matvec3_c(m0, m1, m2), matvec3_neon(m0, m1, m2));

	memset(m2, 0, 4*sizeof(float));
	matvec3_c(m0, m1, m2);
	LOG("matvec3_c = |%.2f, %.2f, %.2f|\n", m2[0], m2[1], m2[2]);

	memset(m2, 0, 4*sizeof(float));
	matvec3_neon(m0, m1, m2);
	LOG("matvec3_neon = |%.2f, %.2f, %.2f|\n", m2[0], m2[1], m2[2]);

	MAT_LOG_RATE("matvec3: c=%i \t neon=%i \t rate=%.2f \n", m3t);

	//MAT4
	MAT_TIME_PAIR(m4t, matmul4_c(m0, m1, m2), matmul4_neon(m0, m1, m2));

	memset(m2, 0, 16*sizeof(float));
	matmul4_c(m0, m1, m2);
	LOG("matmul4_c =\n");
	LOG("\t\t\t|%.2f, %.2f, %.2f, %.2f|\n", m2[0], m2[4], m2[8], m2[12]);
	LOG("\t\t\t|%.2f, %.2f, %.2f, %.2f|\n", m2[1], m2[5], m2[9], m2[13]);
	LOG("\t\t\t|%.2f, %.2f, %.2f, %.2f|\n", m2[2], m2[6], m2[10], m2[14]);
	LOG("\t\t\t|%.2f, %.2f, %.2f, %.2f|\n", m2[3], m2[7], m2[11], m2[15]);

	memset(m2, 0, 16*sizeof(float));
	matmul4_neon(m0, m1, m2);
	LOG("matmul4_neon =\n");
	LOG("\t\t\t|%.2f, %.2f, %.2f, %.2f|\n", m2[0], m2[4], m2[8], m2[12]);
	LOG("\t\t\t|%.2f, %.2f, %.2f, %.2f|\n", m2[1], m2[5], m2[9], m2[13]);
	LOG("\t\t\t|%.2f, %.2f, %.2f, %.2f|\n", m2[2], m2[6], m2[10], m2[14]);
	LOG("\t\t\t|%.2f, %.2f, %.2f, %.2f|\n", m2[3], m2[7], m2[11], m2[15]);

	MAT_LOG_RATE("matmul4: c=%i \t neon=%i \t rate=%.2f \n", m4t);

	//matvec4
	MAT_TIME_PAIR(m4t, matvec4_c(m0, m1, m2), matvec4_neon(m0, m1, m2));

	memset(m2, 0, 4*sizeof(float));
	matvec4_c(m0, m1, m2);
	LOG("matvec4_c = |%.2f, %.2f, %.2f, %f|\n", m2[0], m2[1], m2[2], m2[3]);

	memset(m2, 0, 4*sizeof(float));
	matvec4_neon(m0, m1, m2);
	LOG("matvec4_neon = |%.2f, %.2f, %.2f, %f|\n", m2[0], m2[1], m2[2], m2[3]);

	MAT_LOG_RATE("matvec4: c=%i \t neon=%i \t rate=%.2f \n", m4t);
}

#undef MAT_LOG_RATE
#undef MAT_TIME_PAIR
|
||||
|
||||
int main(int argc, char** argv)
|
||||
{
|
||||
|
||||
int i, ii;
|
||||
#if 1
|
||||
LOG("RUNFAST: Disabled \n");
|
||||
#else
|
||||
LOG("RUNFAST: Enabled \n");
|
||||
enable_runfast();
|
||||
#endif
|
||||
srand(time(NULL));
|
||||
|
||||
#if 1
|
||||
//test single argument functions:
|
||||
LOG("------------------------------------------------------------------------------------------------------\n");
|
||||
LOG("MATRIX FUNCTION TESTS \n");
|
||||
LOG("------------------------------------------------------------------------------------------------------\n");
|
||||
|
||||
test_matrixfunc();
|
||||
test_vectorfunc();
|
||||
|
||||
LOG("------------------------------------------------------------------------------------------------------\n");
|
||||
LOG("CMATH FUNCTION TESTS \n");
|
||||
LOG("------------------------------------------------------------------------------------------------------\n");
|
||||
LOG("Function\tRange\t\tNumber\tABS Max Error\tREL Max Error\tRMS Error\tTime\tRate\n");
|
||||
LOG("------------------------------------------------------------------------------------------------------\n");
|
||||
for(i = 0; i < 51; i++){
|
||||
test_mathfunc1(&test1[i]);
|
||||
|
||||
ii = i - (i % 3);
|
||||
LOG("%s\t", test1[i].name);
|
||||
LOG("[%.2f, %.2f]\t", test1[i].rng0, test1[i].rng1);
|
||||
LOG("%i\t", test1[i].num);
|
||||
LOG("%.2e\t", test1[i].emaxabs);
|
||||
LOG("%.2e%%\t", test1[i].emaxrel);
|
||||
LOG("%.2e\t", test1[i].erms);
|
||||
LOG("%i\t", test1[i].time);
|
||||
LOG("x%.2f\t", (float)test1[ii].time / test1[i].time);
|
||||
LOG("\n");
|
||||
}
|
||||
for(i = 0; i < 9; i++){
|
||||
test_mathfunc2(&test2[i]);
|
||||
|
||||
ii = i - (i % 3);
|
||||
|
||||
LOG("%s\t", test2[i].name);
|
||||
LOG("[%.2f, %.2f]\t", test2[i].rng0, test2[i].rng1);
|
||||
LOG("%i\t", test2[i].num);
|
||||
LOG("%.2e\t", test2[i].emaxabs);
|
||||
LOG("%.2e%%\t", test2[i].emaxrel);
|
||||
LOG("%.2e\t", test2[i].erms);
|
||||
LOG("%i\t", test2[i].time);
|
||||
LOG("x%.2f\t", (float)test2[ii].time / test2[i].time);
|
||||
LOG("\n");
|
||||
}
|
||||
|
||||
#else
|
||||
|
||||
|
||||
float x = 0;
|
||||
for(x = -M_PI_2; x < M_PI_2; x+= 0.01)
|
||||
{
|
||||
LOG("x=%.2f\t in=%.2f\t c=%.2f\t neon=%.2f \n", x, sinhf(x), sinhf_c(x), sinhf_neon(x));
|
||||
}
|
||||
|
||||
#endif
|
||||
|
||||
return 0;
|
||||
}
|
67
deps/math-neon/source/math_acosf.c
vendored
67
deps/math-neon/source/math_acosf.c
vendored
@ -1,67 +0,0 @@
|
||||
/*
|
||||
The MIT License (MIT)
|
||||
|
||||
Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com)
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "math.h"
|
||||
#include "math_neon.h"
|
||||
|
||||
/*
|
||||
Test func : acosf(x)
|
||||
Test Range: -1.0 < x < 1.0
|
||||
Peak Error: ~0.005%
|
||||
RMS Error: ~0.001%
|
||||
*/
|
||||
|
||||
/* pi/2 stored as a float; acos is evaluated as pi/2 - asin. */
const float __acosf_pi_2 = M_PI_2;

/* Plain C acos(x), computed as pi/2 - asinf_c(x). */
float acosf_c(float x)
{
	const float asin_x = asinf_c(x);

	return __acosf_pi_2 - asin_x;
}
|
||||
|
||||
|
||||
/*
 * NEON acos(x), hard-float variant: pi/2 - asin(x).
 *
 * The asm block consumes d0 as it is left after the asinf_neon_hfp() call
 * and writes the result back to d0; no C-level return statement is used.
 * NOTE(review): when __MATH_NEON is not defined the body is empty and no
 * value is returned.
 */
float acosf_neon_hfp(float x)
{
#ifdef __MATH_NEON
	asinf_neon_hfp(x);
	asm volatile (
	"vdup.f32 d1, %0 \n\t"			//d1 = {pi/2, pi/2};
	"vsub.f32 d0, d1, d0 \n\t"		//d0 = d1 - d0;
	:: "r"(__acosf_pi_2)
	: "d0", "d1"					//both registers are written by the asm
	);
#endif
}
|
||||
|
||||
/*
 * acos(x) entry point for the soft-float calling convention.
 *
 * Without __MATH_NEON it forwards to acosf_c().  With __MATH_NEON it copies
 * r0 into s0, calls the hard-float routine and copies s0 back into r0; the
 * result is handed back through r0 by the asm, not by a return statement.
 */
float acosf_neon_sfp(float x)
{
#ifndef __MATH_NEON
	return acosf_c(x);
#else
	asm volatile ("vmov.f32 s0, r0 \n\t");
	acosf_neon_hfp(x);
	asm volatile ("vmov.f32 r0, s0 \n\t");
#endif
}
|
||||
|
||||
|
||||
|
183
deps/math-neon/source/math_asinf.c
vendored
183
deps/math-neon/source/math_asinf.c
vendored
@ -1,183 +0,0 @@
|
||||
/*
|
||||
The MIT License (MIT)
|
||||
|
||||
Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com)
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "math.h"
|
||||
#include "math_neon.h"
|
||||
|
||||
/*
|
||||
Test func : asinf(x)
|
||||
Test Range: -1.0 < x < 1.0
|
||||
Peak Error: ~0.005%
|
||||
RMS Error: ~0.001%
|
||||
*/
|
||||
|
||||
|
||||
const float __asinf_lut[4] = {
|
||||
0.105312459675071, //p7
|
||||
0.169303418571894, //p3
|
||||
0.051599985887214, //p5
|
||||
0.999954835104825 //p1
|
||||
};
|
||||
|
||||
const float __asinf_pi_2 = M_PI_2;
|
||||
|
||||
float asinf_c(float x)
|
||||
{
|
||||
|
||||
float a, b, c, d, r, ax;
|
||||
int m;
|
||||
|
||||
union {
|
||||
float f;
|
||||
int i;
|
||||
} xx;
|
||||
|
||||
ax = fabs(x);
|
||||
d = 0.5;
|
||||
d = d - ax*0.5;
|
||||
|
||||
//fast invsqrt approx
|
||||
xx.f = d;
|
||||
xx.i = 0x5F3759DF - (xx.i >> 1); //VRSQRTE
|
||||
c = d * xx.f;
|
||||
b = (3.0f - c * xx.f) * 0.5; //VRSQRTS
|
||||
xx.f = xx.f * b;
|
||||
c = d * xx.f;
|
||||
b = (3.0f - c * xx.f) * 0.5;
|
||||
xx.f = xx.f * b;
|
||||
|
||||
//fast inverse approx
|
||||
d = xx.f;
|
||||
m = 0x3F800000 - (xx.i & 0x7F800000);
|
||||
xx.i = xx.i + m;
|
||||
xx.f = 1.41176471f - 0.47058824f * xx.f;
|
||||
xx.i = xx.i + m;
|
||||
b = 2.0 - xx.f * d;
|
||||
xx.f = xx.f * b;
|
||||
b = 2.0 - xx.f * d;
|
||||
xx.f = xx.f * b;
|
||||
|
||||
//if |x|>0.5 -> x = sqrt((1-x)/2)
|
||||
xx.f = xx.f - ax;
|
||||
a = (ax > 0.5f);
|
||||
d = __asinf_pi_2 * a;
|
||||
c = 1.0f - 3.0f * a;
|
||||
ax = ax + xx.f * a;
|
||||
|
||||
//polynomial evaluation
|
||||
xx.f = ax * ax;
|
||||
a = (__asinf_lut[0] * ax) * xx.f + (__asinf_lut[2] * ax);
|
||||
b = (__asinf_lut[1] * ax) * xx.f + (__asinf_lut[3] * ax);
|
||||
xx.f = xx.f * xx.f;
|
||||
r = b + a * xx.f;
|
||||
r = d + c * r;
|
||||
|
||||
a = r + r;
|
||||
b = (x < 0.0f);
|
||||
r = r - a * b;
|
||||
return r;
|
||||
}
|
||||
|
||||
|
||||
/*
 * NEON asin(x), hard-float variant.
 *
 * The asm reads x from d0[0] and leaves the result in s0; there is no
 * C-level return statement.  It follows the same steps as asinf_c():
 * reciprocal square root and reciprocal estimates with two refinement steps
 * each, a 0/1 mask for |x| > 0.5, the odd polynomial, and a final sign fix.
 * NOTE(review): when __MATH_NEON is not defined the body is empty and no
 * value is returned.
 */
float asinf_neon_hfp(float x)
{
#ifdef __MATH_NEON
	asm volatile (

	"vdup.f32 d0, d0[0] \n\t"			//d0 = {x, x};
	"vdup.f32 d4, %1 \n\t"				//d4 = {pi/2, pi/2};
	"vmov.f32 d6, d0 \n\t"				//d6 = d0 (signed copy of x);
	"vabs.f32 d0, d0 \n\t"				//d0 = fabs(d0);

	"vmov.f32 d5, #0.5 \n\t"			//d5 = 0.5;
	"vmls.f32 d5, d0, d5 \n\t"			//d5 = d5 - d0*d5;

	//fast invsqrt approx
	"vmov.f32 d1, d5 \n\t"				//d1 = d5
	"vrsqrte.f32 d5, d5 \n\t"			//d5 = ~ 1.0 / sqrt(d5)
	"vmul.f32 d2, d5, d1 \n\t"			//d2 = d5 * d1
	"vrsqrts.f32 d3, d2, d5 \n\t"		//d3 = (3 - d5 * d2) / 2
	"vmul.f32 d5, d5, d3 \n\t"			//d5 = d5 * d3
	"vmul.f32 d2, d5, d1 \n\t"			//d2 = d5 * d1
	"vrsqrts.f32 d3, d2, d5 \n\t"		//d3 = (3 - d5 * d2) / 2
	"vmul.f32 d5, d5, d3 \n\t"			//d5 = d5 * d3

	//fast reciprocal approximation
	"vrecpe.f32 d1, d5 \n\t"			//d1 = ~ 1 / d5;
	"vrecps.f32 d2, d1, d5 \n\t"		//d2 = 2.0 - d1 * d5;
	"vmul.f32 d1, d1, d2 \n\t"			//d1 = d1 * d2;
	"vrecps.f32 d2, d1, d5 \n\t"		//d2 = 2.0 - d1 * d5;
	"vmul.f32 d5, d1, d2 \n\t"			//d5 = d1 * d2;

	//if |x| > 0.5 -> ax = sqrt((1-ax)/2), r = pi/2
	"vsub.f32 d5, d0, d5 \n\t"			//d5 = d0 - d5;
	"vmov.f32 d2, #0.5 \n\t"			//d2 = 0.5;
	"vcgt.f32 d3, d0, d2 \n\t"			//d3 = (d0 > d2), all-ones mask;
	"vmov.f32 d1, #3.0 \n\t"			//d1 = 3.0;
	"vshr.u32 d3, #31 \n\t"				//d3 = d3 >> 31 (integer 0 or 1);
	"vmov.f32 d16, #1.0 \n\t"			//d16 = 1.0;
	"vcvt.f32.u32 d3, d3 \n\t"			//d3 = (float) d3;
	"vmls.f32 d0, d5, d3[0] \n\t"		//d0 = d0 - d5 * d3[0];
	"vmul.f32 d7, d4, d3[0] \n\t"		//d7 = d4 * d3[0];
	"vmls.f32 d16, d1, d3[0] \n\t"		//d16 = d16 - d1 * d3[0];

	//polynomial:
	"vmul.f32 d2, d0, d0 \n\t"			//d2 = d0*d0 = {ax^2, ax^2}
	"vld1.32 {d4, d5}, [%0] \n\t"		//d4 = {p7, p3}, d5 = {p5, p1}
	"vmul.f32 d3, d2, d2 \n\t"			//d3 = d2*d2 = {x^4, x^4}
	"vmul.f32 q0, q2, d0[0] \n\t"		//q0 = q2 * d0[0] = {p7x, p3x, p5x, p1x}
	"vmla.f32 d1, d0, d2[0] \n\t"		//d1 = d1 + d0*d2[0] = {p5x + p7x^3, p1x + p3x^3}
	"vmla.f32 d1, d3, d1[0] \n\t"		//d1 = d1 + d3*d1[0] = {..., p1x + p3x^3 + p5x^5 + p7x^7}

	"vmla.f32 d7, d1, d16 \n\t"			//d7 = d7 + d1*d16

	"vadd.f32 d2, d7, d7 \n\t"			//d2 = d7 + d7
	"vclt.f32 d3, d6, #0 \n\t"			//d3 = (d6 < 0)
	"vshr.u32 d3, #31 \n\t"				//d3 = d3 >> 31;
	"vcvt.f32.u32 d3, d3 \n\t"			//d3 = (float) d3
	"vmls.f32 d7, d2, d3[0] \n\t"		//d7 = d7 - d2 * d3[0];

	"vmov.f32 s0, s15 \n\t"				//s0 = s15 (upper lane of d7)

	:: "r"(__asinf_lut), "r"(__asinf_pi_2)
	//d16 is written above, so it has to be listed as well
	: "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d16"
	);
#endif
}
|
||||
|
||||
|
||||
/*
 * asin(x) entry point for the soft-float calling convention.
 *
 * Without __MATH_NEON it forwards to asinf_c().  With __MATH_NEON it copies
 * r0 into s0, calls the hard-float routine and copies s0 back into r0; the
 * result is handed back through r0 by the asm, not by a return statement.
 */
float asinf_neon_sfp(float x)
{
#ifndef __MATH_NEON
	return asinf_c(x);
#else
	asm volatile ("vmov.f32 s0, r0 \n\t");
	asinf_neon_hfp(x);
	asm volatile ("vmov.f32 r0, s0 \n\t");
#endif
}
|
||||
|
||||
|
||||
|
||||
|
170
deps/math-neon/source/math_atan2f.c
vendored
170
deps/math-neon/source/math_atan2f.c
vendored
@ -1,170 +0,0 @@
|
||||
/*
|
||||
The MIT License (MIT)
|
||||
|
||||
Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com)
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "math.h"
|
||||
#include "math_neon.h"
|
||||
|
||||
const float __atan2f_lut[4] = {
|
||||
-0.0443265554792128, //p7
|
||||
-0.3258083974640975, //p3
|
||||
+0.1555786518463281, //p5
|
||||
+0.9997878412794807 //p1
|
||||
};
|
||||
|
||||
const float __atan2f_pi_2 = M_PI_2;
|
||||
|
||||
float atan2f_c(float y, float x)
|
||||
{
|
||||
float a, b, c, r, xx;
|
||||
int m;
|
||||
union {
|
||||
float f;
|
||||
int i;
|
||||
} xinv;
|
||||
|
||||
//fast inverse approximation (2x newton)
|
||||
xx = fabs(x);
|
||||
xinv.f = xx;
|
||||
m = 0x3F800000 - (xinv.i & 0x7F800000);
|
||||
xinv.i = xinv.i + m;
|
||||
xinv.f = 1.41176471f - 0.47058824f * xinv.f;
|
||||
xinv.i = xinv.i + m;
|
||||
b = 2.0 - xinv.f * xx;
|
||||
xinv.f = xinv.f * b;
|
||||
b = 2.0 - xinv.f * xx;
|
||||
xinv.f = xinv.f * b;
|
||||
|
||||
c = fabs(y * xinv.f);
|
||||
|
||||
//fast inverse approximation (2x newton)
|
||||
xinv.f = c;
|
||||
m = 0x3F800000 - (xinv.i & 0x7F800000);
|
||||
xinv.i = xinv.i + m;
|
||||
xinv.f = 1.41176471f - 0.47058824f * xinv.f;
|
||||
xinv.i = xinv.i + m;
|
||||
b = 2.0 - xinv.f * c;
|
||||
xinv.f = xinv.f * b;
|
||||
b = 2.0 - xinv.f * c;
|
||||
xinv.f = xinv.f * b;
|
||||
|
||||
//if |x| > 1.0 -> ax = -1/ax, r = pi/2
|
||||
xinv.f = xinv.f + c;
|
||||
a = (c > 1.0f);
|
||||
c = c - a * xinv.f;
|
||||
r = a * __atan2f_pi_2;
|
||||
|
||||
//polynomial evaluation
|
||||
xx = c * c;
|
||||
a = (__atan2f_lut[0] * c) * xx + (__atan2f_lut[2] * c);
|
||||
b = (__atan2f_lut[1] * c) * xx + (__atan2f_lut[3] * c);
|
||||
xx = xx * xx;
|
||||
r = r + a * xx;
|
||||
r = r + b;
|
||||
|
||||
//determine quadrant and test for small x.
|
||||
b = M_PI;
|
||||
b = b - 2.0f * r;
|
||||
r = r + (x < 0.0f) * b;
|
||||
b = (fabs(x) < 0.000001f);
|
||||
c = !b;
|
||||
r = c * r;
|
||||
r = r + __atan2f_pi_2 * b;
|
||||
b = r + r;
|
||||
r = r - (y < 0.0f) * b;
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
/*
 * NEON atan2(y, x), hard-float variant.
 *
 * The asm reads y from d0[0] and x from d0[1] and leaves the result in s0;
 * there is no C-level return statement.  It forms t = y * (1/x), reduces
 * |t| > 1 to -1/|t| with a pi/2 offset, evaluates the odd polynomial and
 * negates the result when t is negative.
 *
 * The |t| > 1 case is picked with a bitwise select on the compare mask.
 * Converting the all-ones compare mask straight to float gives 4294967296.0
 * rather than 1.0, and multiplying an infinite reciprocal by a zero mask
 * gives NaN; the select avoids both.  The sign step shifts the mask down to
 * 0/1 before converting it.
 *
 * NOTE(review): unlike atan2f_c() there is no quadrant correction for
 * x < 0 in this routine; the result is atan(y/x).
 * NOTE(review): when __MATH_NEON is not defined the body is empty and no
 * value is returned.
 */
float atan2f_neon_hfp(float y, float x)
{
#ifdef __MATH_NEON
	asm volatile (

	"vdup.f32 d17, d0[1] \n\t"			//d17 = {x, x};
	"vdup.f32 d16, d0[0] \n\t"			//d16 = {y, y};

	//1.0 / x
	"vrecpe.f32 d18, d17 \n\t"			//d18 = ~ 1 / d17;
	"vrecps.f32 d19, d18, d17 \n\t"		//d19 = 2.0 - d18 * d17;
	"vmul.f32 d18, d18, d19 \n\t"		//d18 = d18 * d19;
	"vrecps.f32 d19, d18, d17 \n\t"		//d19 = 2.0 - d18 * d17;
	"vmul.f32 d18, d18, d19 \n\t"		//d18 = d18 * d19;

	//y * (1.0 / x)
	"vmul.f32 d0, d16, d18 \n\t"		//d0 = d16 * d18;

	"vdup.f32 d4, %1 \n\t"				//d4 = {pi/2, pi/2};
	"vmov.f32 d6, d0 \n\t"				//d6 = d0 (signed quotient);
	"vabs.f32 d0, d0 \n\t"				//d0 = fabs(d0);

	//fast reciprocal approximation
	"vrecpe.f32 d1, d0 \n\t"			//d1 = ~ 1 / d0;
	"vrecps.f32 d2, d1, d0 \n\t"		//d2 = 2.0 - d1 * d0;
	"vmul.f32 d1, d1, d2 \n\t"			//d1 = d1 * d2;
	"vrecps.f32 d2, d1, d0 \n\t"		//d2 = 2.0 - d1 * d0;
	"vmul.f32 d1, d1, d2 \n\t"			//d1 = d1 * d2;

	//if |t| > 1.0 -> t = -1/|t|, r = pi/2
	"vneg.f32 d1, d1 \n\t"				//d1 = -d1 = -1/|t|;
	"vmov.f32 d2, #1.0 \n\t"			//d2 = 1.0;
	"vcgt.f32 d3, d0, d2 \n\t"			//d3 = (d0 > d2), all-ones mask;
	"vbit d0, d1, d3 \n\t"				//d0 = d3 ? d1 : d0;
	"vand d7, d4, d3 \n\t"				//d7 = d3 ? pi/2 : 0;

	//polynomial:
	"vmul.f32 d2, d0, d0 \n\t"			//d2 = d0*d0 = {ax^2, ax^2}
	"vld1.32 {d4, d5}, [%0] \n\t"		//d4 = {p7, p3}, d5 = {p5, p1}
	"vmul.f32 d3, d2, d2 \n\t"			//d3 = d2*d2 = {x^4, x^4}
	"vmul.f32 q0, q2, d0[0] \n\t"		//q0 = q2 * d0[0] = {p7x, p3x, p5x, p1x}
	"vmla.f32 d1, d0, d2[0] \n\t"		//d1 = d1 + d0*d2[0] = {p5x + p7x^3, p1x + p3x^3}
	"vmla.f32 d1, d3, d1[0] \n\t"		//d1 = d1 + d3*d1[0] = {..., p1x + p3x^3 + p5x^5 + p7x^7}
	"vadd.f32 d1, d1, d7 \n\t"			//d1 = d1 + d7

	"vadd.f32 d2, d1, d1 \n\t"			//d2 = d1 + d1
	"vclt.f32 d3, d6, #0 \n\t"			//d3 = (d6 < 0), all-ones mask
	"vshr.u32 d3, #31 \n\t"				//d3 = d3 >> 31 (integer 0 or 1)
	"vcvt.f32.u32 d3, d3 \n\t"			//d3 = (float) d3
	"vmls.f32 d1, d3, d2 \n\t"			//d1 = d1 - d2 * d3;

	"vmov.f32 s0, s3 \n\t"				//s0 = s3

	:: "r"(__atan2f_lut), "r"(__atan2f_pi_2)
	//d16-d19 are written above, so they have to be listed as well
	: "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7",
	  "d16", "d17", "d18", "d19"
	);
#endif
}
|
||||
|
||||
|
||||
/*
 * atan2(y, x) entry point for the soft-float calling convention.
 *
 * The first argument is y and the second is x: the NEON path copies r0 to
 * s0 and r1 to s1, and atan2f_neon_hfp() reads y from s0 and x from s1.
 * The parameters are named accordingly so that the C fallback passes them
 * to atan2f_c() in the same order instead of swapped.
 *
 * In the NEON path the result is handed back through r0 by the asm, not by
 * a return statement.
 */
float atan2f_neon_sfp(float y, float x)
{
#ifdef __MATH_NEON
	asm volatile ("vmov.f32 s0, r0 \n\t");
	asm volatile ("vmov.f32 s1, r1 \n\t");
	atan2f_neon_hfp(y, x);
	asm volatile ("vmov.f32 r0, s0 \n\t");
#else
	return atan2f_c(y, x);
#endif
}
|
149
deps/math-neon/source/math_atanf.c
vendored
149
deps/math-neon/source/math_atanf.c
vendored
@ -1,149 +0,0 @@
|
||||
/*
|
||||
The MIT License (MIT)
|
||||
|
||||
Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com)
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "math.h"
|
||||
#include "math_neon.h"
|
||||
|
||||
const float __atanf_lut[4] = {
|
||||
-0.0443265554792128, //p7
|
||||
-0.3258083974640975, //p3
|
||||
+0.1555786518463281, //p5
|
||||
+0.9997878412794807 //p1
|
||||
};
|
||||
|
||||
const float __atanf_pi_2 = M_PI_2;
|
||||
|
||||
float atanf_c(float x)
|
||||
{
|
||||
|
||||
float a, b, r, xx;
|
||||
int m;
|
||||
|
||||
union {
|
||||
float f;
|
||||
int i;
|
||||
} xinv, ax;
|
||||
|
||||
ax.f = fabs(x);
|
||||
|
||||
//fast inverse approximation (2x newton)
|
||||
xinv.f = ax.f;
|
||||
m = 0x3F800000 - (xinv.i & 0x7F800000);
|
||||
xinv.i = xinv.i + m;
|
||||
xinv.f = 1.41176471f - 0.47058824f * xinv.f;
|
||||
xinv.i = xinv.i + m;
|
||||
b = 2.0 - xinv.f * ax.f;
|
||||
xinv.f = xinv.f * b;
|
||||
b = 2.0 - xinv.f * ax.f;
|
||||
xinv.f = xinv.f * b;
|
||||
|
||||
//if |x| > 1.0 -> ax = -1/ax, r = pi/2
|
||||
xinv.f = xinv.f + ax.f;
|
||||
a = (ax.f > 1.0f);
|
||||
ax.f = ax.f - a * xinv.f;
|
||||
r = a * __atanf_pi_2;
|
||||
|
||||
//polynomial evaluation
|
||||
xx = ax.f * ax.f;
|
||||
a = (__atanf_lut[0] * ax.f) * xx + (__atanf_lut[2] * ax.f);
|
||||
b = (__atanf_lut[1] * ax.f) * xx + (__atanf_lut[3] * ax.f);
|
||||
xx = xx * xx;
|
||||
b = b + a * xx;
|
||||
r = r + b;
|
||||
|
||||
//if x < 0 -> r = -r
|
||||
a = 2 * r;
|
||||
b = (x < 0.0f);
|
||||
r = r - a * b;
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
|
||||
/*
 * NEON atan(x), hard-float variant.
 *
 * The asm reads x from d0[0] and leaves the result in s0; there is no
 * C-level return statement.  |x| > 1 is reduced to -1/|x| with a pi/2
 * offset, the odd polynomial is evaluated and the result is negated for
 * negative x.
 *
 * The |x| > 1 case is picked with a bitwise select on the compare mask
 * instead of multiplying by a 0/1 mask: for x == 0 the reciprocal estimate
 * is infinite and infinity * 0 would turn the result into NaN.
 *
 * NOTE(review): when __MATH_NEON is not defined the body is empty and no
 * value is returned.
 */
float atanf_neon_hfp(float x)
{
#ifdef __MATH_NEON
	asm volatile (

	"vdup.f32 d0, d0[0] \n\t"			//d0 = {x, x};

	"vdup.f32 d4, %1 \n\t"				//d4 = {pi/2, pi/2};
	"vmov.f32 d6, d0 \n\t"				//d6 = d0 (signed copy of x);
	"vabs.f32 d0, d0 \n\t"				//d0 = fabs(d0);

	//fast reciprocal approximation
	"vrecpe.f32 d1, d0 \n\t"			//d1 = ~ 1 / d0;
	"vrecps.f32 d2, d1, d0 \n\t"		//d2 = 2.0 - d1 * d0;
	"vmul.f32 d1, d1, d2 \n\t"			//d1 = d1 * d2;
	"vrecps.f32 d2, d1, d0 \n\t"		//d2 = 2.0 - d1 * d0;
	"vmul.f32 d1, d1, d2 \n\t"			//d1 = d1 * d2;

	//if |x| > 1.0 -> ax = -1/ax, r = pi/2
	"vneg.f32 d1, d1 \n\t"				//d1 = -d1 = -1/ax;
	"vmov.f32 d2, #1.0 \n\t"			//d2 = 1.0;
	"vcgt.f32 d3, d0, d2 \n\t"			//d3 = (d0 > d2), all-ones mask;
	"vbit d0, d1, d3 \n\t"				//d0 = d3 ? d1 : d0;
	"vand d7, d4, d3 \n\t"				//d7 = d3 ? pi/2 : 0;

	//polynomial:
	"vmul.f32 d2, d0, d0 \n\t"			//d2 = d0*d0 = {ax^2, ax^2}
	"vld1.32 {d4, d5}, [%0] \n\t"		//d4 = {p7, p3}, d5 = {p5, p1}
	"vmul.f32 d3, d2, d2 \n\t"			//d3 = d2*d2 = {x^4, x^4}
	"vmul.f32 q0, q2, d0[0] \n\t"		//q0 = q2 * d0[0] = {p7x, p3x, p5x, p1x}
	"vmla.f32 d1, d0, d2[0] \n\t"		//d1 = d1 + d0*d2[0] = {p5x + p7x^3, p1x + p3x^3}
	"vmla.f32 d1, d3, d1[0] \n\t"		//d1 = d1 + d3*d1[0] = {..., p1x + p3x^3 + p5x^5 + p7x^7}
	"vadd.f32 d1, d1, d7 \n\t"			//d1 = d1 + d7

	"vadd.f32 d2, d1, d1 \n\t"			//d2 = d1 + d1
	"vclt.f32 d3, d6, #0 \n\t"			//d3 = (d6 < 0), all-ones mask
	"vshr.u32 d3, #31 \n\t"				//d3 = d3 >> 31 (integer 0 or 1)
	"vcvt.f32.u32 d3, d3 \n\t"			//d3 = (float) d3
	"vmls.f32 d1, d3, d2 \n\t"			//d1 = d1 - d2 * d3;

	"vmov.f32 s0, s3 \n\t"				//s0 = s3

	:: "r"(__atanf_lut), "r"(__atanf_pi_2)
	: "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7"
	);

#endif
}
|
||||
|
||||
|
||||
/*
 * atan(x) entry point for the soft-float calling convention.
 *
 * Without __MATH_NEON it forwards to atanf_c().  With __MATH_NEON it
 * duplicates r0 into both lanes of d0, calls the hard-float routine and
 * copies s0 back into r0; the result is handed back through r0 by the asm,
 * not by a return statement.
 */
float atanf_neon_sfp(float x)
{
#ifndef __MATH_NEON
	return atanf_c(x);
#else
	asm volatile ("vdup.f32 d0, r0 \n\t");
	atanf_neon_hfp(x);
	asm volatile ("vmov.f32 r0, s0 \n\t");
#endif
}
|
||||
|
||||
|
||||
|
71
deps/math-neon/source/math_ceilf.c
vendored
71
deps/math-neon/source/math_ceilf.c
vendored
@ -1,71 +0,0 @@
|
||||
/*
|
||||
The MIT License (MIT)
|
||||
|
||||
Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com)
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/*
|
||||
Assumes the floating point value |x| < 2147483648
|
||||
*/
|
||||
|
||||
#include "math.h"
|
||||
#include "math_neon.h"
|
||||
|
||||
/*
 * Plain C ceil(x): truncate toward zero through an int conversion, then
 * add one when x lies above the truncated value.
 */
float ceilf_c(float x)
{
	const float truncated = (float) (int) x;

	return truncated + (x > truncated);
}
|
||||
|
||||
/*
 * NEON ceil(x), hard-float variant.
 *
 * The asm works on d0 in place: it reads x from there and leaves the
 * result there; no C-level return statement is used.
 * NOTE(review): when __MATH_NEON is not defined the body is empty and no
 * value is returned.
 */
float ceilf_neon_hfp(float x)
{
#ifdef __MATH_NEON
	asm volatile (
		/* d1 = x truncated: float -> signed int -> float */
		"vcvt.s32.f32 d1, d0 \n\t"
		"vcvt.f32.s32 d1, d1 \n\t"
		/* d0 = 1.0 where x > truncated value, else 0.0 */
		"vcgt.f32 d0, d0, d1 \n\t"
		"vshr.u32 d0, #31 \n\t"
		"vcvt.f32.u32 d0, d0 \n\t"
		/* result = truncated value + 0.0 or 1.0 */
		"vadd.f32 d0, d1, d0 \n\t"
		:
		:
		: "d0", "d1"
	);
#endif
}
|
||||
|
||||
/*
 * ceil(x) entry point for the soft-float calling convention.
 *
 * Without __MATH_NEON it forwards to ceilf_c().  With __MATH_NEON it copies
 * r0 into s0, calls the hard-float routine and copies s0 back into r0; the
 * result is handed back through r0 by the asm, not by a return statement.
 */
float ceilf_neon_sfp(float x)
{
#ifndef __MATH_NEON
	return ceilf_c(x);
#else
	asm volatile ("vmov.f32 s0, r0 \n\t");
	ceilf_neon_hfp(x);
	asm volatile ("vmov.f32 r0, s0 \n\t");
#endif
}
|
||||
|
||||
|
50
deps/math-neon/source/math_cosf.c
vendored
50
deps/math-neon/source/math_cosf.c
vendored
@ -1,50 +0,0 @@
|
||||
/*
|
||||
The MIT License (MIT)
|
||||
|
||||
Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com)
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "math_neon.h"
|
||||
|
||||
float cosf_c(float x)
|
||||
{
|
||||
return sinf_c(x + M_PI_2);
|
||||
}
|
||||
|
||||
/*
 * NEON cos(x), hard-float variant: sinf_neon_hfp(x + pi/2).
 * NOTE(review): when __MATH_NEON is not defined the body is empty and no
 * value is returned.
 */
float cosf_neon_hfp(float x)
{
#ifdef __MATH_NEON
	return sinf_neon_hfp((float) (x + M_PI_2));
#endif
}
|
||||
|
||||
/*
 * cos(x) entry point for the soft-float calling convention.
 *
 * Without __MATH_NEON it forwards to cosf_c().  With __MATH_NEON it
 * duplicates r0 into both lanes of d0, calls the hard-float routine and
 * copies s0 back into r0; the result is handed back through r0 by the asm,
 * not by a return statement.
 */
float cosf_neon_sfp(float x)
{
#ifndef __MATH_NEON
	return cosf_c(x);
#else
	asm volatile ("vdup.f32 d0, r0 \n\t");
	cosf_neon_hfp(x);
	asm volatile ("vmov.f32 r0, s0 \n\t");
#endif
}
|
||||
|
120
deps/math-neon/source/math_coshf.c
vendored
120
deps/math-neon/source/math_coshf.c
vendored
@ -1,120 +0,0 @@
|
||||
/*
|
||||
The MIT License (MIT)
|
||||
|
||||
Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com)
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "math.h"
|
||||
#include "math_neon.h"
|
||||
|
||||
/* Range-reduction pair loaded by the NEON code as {invrange, range}. */
const float __coshf_rng[2] = {
	1.442695041f,
	0.693147180f
};

/*
 * Polynomial coefficients p7 .. p0.  Every coefficient is stored twice in a
 * row so that one 64-bit load fills both lanes of a D register.
 */
const float __coshf_lut[16] = {
	0.00019578093328483123,	0.00019578093328483123,	/* p7, p7 */
	0.0014122663401803872,	0.0014122663401803872,	/* p6, p6 */
	0.008336936973260111,	0.008336936973260111,	/* p5, p5 */
	0.04165989275009526,	0.04165989275009526,	/* p4, p4 */
	0.16666570253074878,	0.16666570253074878,	/* p3, p3 */
	0.5000006143673624,		0.5000006143673624,		/* p2, p2 */
	1.000000059694879,		1.000000059694879,		/* p1, p1 */
	0.9999999916728642,		0.9999999916728642		/* p0, p0 */
};
|
||||
|
||||
|
||||
/* Plain C cosh(x) = 0.5 * e^x + 0.5 * e^-x, built on expf_c(). */
float coshf_c(float x)
{
	const float flipped = -x;
	const float grow = expf_c(x);
	const float decay = expf_c(flipped);
	float half_grow;

	half_grow = grow * 0.5f;
	return half_grow + 0.5f * decay;
}
|
||||
|
||||
|
||||
/*
 * NEON cosh(x), hard-float variant.
 *
 * The asm reads x from d0[0] and leaves the result in s0; there is no
 * C-level return statement.  It evaluates the exponential polynomial for x
 * and -x in the two lanes of d0, scales each lane by 2^n through an integer
 * add to the exponent field, and averages the two lanes.
 *
 * The table pointer is advanced by the post-indexed loads, so it is passed
 * as a read-write operand through a local copy; an input-only operand must
 * not be modified by the asm.
 *
 * NOTE(review): when __MATH_NEON is not defined the body is empty and no
 * value is returned.
 */
float coshf_neon_hfp(float x)
{
#ifdef __MATH_NEON
	const float *lut = __coshf_lut;

	asm volatile (
	"vdup.f32 d0, d0[0] \n\t"				//d0 = {x, x}
	"fnegs s1, s1 \n\t"						//s1 = -s1

	//Range Reduction:
	"vld1.32 d2, [%[rng]] \n\t"				//d2 = {invrange, range}
	"vld1.32 {d16, d17}, [%[lut]]! \n\t"	//d16 = {p7, p7}, d17 = {p6, p6}
	"vmul.f32 d6, d0, d2[0] \n\t"			//d6 = d0 * d2[0]
	"vcvt.s32.f32 d6, d6 \n\t"				//d6 = (int) d6
	"vld1.32 {d18}, [%[lut]]! \n\t"			//d18 = {p5, p5}
	"vcvt.f32.s32 d1, d6 \n\t"				//d1 = (float) d6
	"vld1.32 {d19}, [%[lut]]! \n\t"			//d19 = {p4, p4}
	"vmls.f32 d0, d1, d2[1] \n\t"			//d0 = d0 - d1 * d2[1]
	"vld1.32 {d20}, [%[lut]]! \n\t"			//d20 = {p3, p3}

	//polynomial:
	"vmla.f32 d17, d16, d0 \n\t"			//d17 = d17 + d16 * d0;
	"vld1.32 {d21}, [%[lut]]! \n\t"			//d21 = {p2, p2}
	"vmla.f32 d18, d17, d0 \n\t"			//d18 = d18 + d17 * d0;
	"vld1.32 {d22}, [%[lut]]! \n\t"			//d22 = {p1, p1}
	"vmla.f32 d19, d18, d0 \n\t"			//d19 = d19 + d18 * d0;
	"vld1.32 {d23}, [%[lut]]! \n\t"			//d23 = {p0, p0}
	"vmla.f32 d20, d19, d0 \n\t"			//d20 = d20 + d19 * d0;
	"vmla.f32 d21, d20, d0 \n\t"			//d21 = d21 + d20 * d0;
	"vmla.f32 d22, d21, d0 \n\t"			//d22 = d22 + d21 * d0;
	"vmla.f32 d23, d22, d0 \n\t"			//d23 = d23 + d22 * d0;

	//multiply by 2 ^ m
	"vshl.i32 d6, d6, #23 \n\t"				//d6 = d6 << 23
	"vadd.i32 d0, d23, d6 \n\t"				//d0 = d23 + d6

	"vdup.f32 d2, d0[1] \n\t"				//d2 = {s1, s1}
	"vmov.f32 d1, #0.5 \n\t"				//d1 = 0.5
	"vadd.f32 d0, d0, d2 \n\t"				//d0 = d0 + d2
	"vmul.f32 d0, d1 \n\t"					//d0 = d0 * d1

	: [lut] "+r"(lut)
	: [rng] "r"(__coshf_rng)
	//d16-d23 are written above, so they have to be listed as well
	: "d0", "d1", "q1", "q2", "d6",
	  "d16", "d17", "d18", "d19", "d20", "d21", "d22", "d23"
	);

#endif
}
|
||||
|
||||
/*
 * cosh(x) entry point for the soft-float calling convention.
 *
 * Without __MATH_NEON it forwards to coshf_c().  With __MATH_NEON it copies
 * r0 into s0, calls the hard-float routine and copies s0 back into r0; the
 * result is handed back through r0 by the asm, not by a return statement.
 */
float coshf_neon_sfp(float x)
{
#ifndef __MATH_NEON
	return coshf_c(x);
#else
	asm volatile ("vmov.f32 s0, r0 \n\t");
	coshf_neon_hfp(x);
	asm volatile ("vmov.f32 r0, s0 \n\t");
#endif
}
|
135
deps/math-neon/source/math_expf.c
vendored
135
deps/math-neon/source/math_expf.c
vendored
@ -1,135 +0,0 @@
|
||||
/*
|
||||
The MIT License (MIT)
|
||||
|
||||
Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com)
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/*
|
||||
Based on:
|
||||
|
||||
e ^ x = (1+m) * (2^n)
|
||||
x = log(1+m) + n * log(2)
|
||||
n = (int) (x * 1.0 / log(2))
|
||||
(1+m) = e ^ (x - n * log(2))
|
||||
(1+m) = Poly(x - n * log(2))
|
||||
|
||||
where Poly(x) is the Minimax approximation of e ^ x over the
|
||||
range [-Log(2), Log(2)]
|
||||
|
||||
Test func : expf(x)
|
||||
Test Range: 0 < x < 50
|
||||
Peak Error: ~0.00024%
|
||||
RMS Error: ~0.00007%
|
||||
*/
|
||||
|
||||
#include "math.h"
|
||||
#include "math_neon.h"
|
||||
|
||||
//Range-reduction constants: {1 / ln(2), ln(2)}.
const float __expf_rng[2] = {
	1.442695041f,
	0.693147180f
};

//Polynomial coefficients, interleaved so that two 4-float loads yield
//{p0, p4, p2, p6} and {p1, p5, p3, p7}.
const float __expf_lut[8] = {
	0.9999999916728642,		//p0
	0.04165989275009526,	//p4
	0.5000006143673624,		//p2
	0.0014122663401803872,	//p6
	1.000000059694879,		//p1
	0.008336936973260111,	//p5
	0.16666570253074878,	//p3
	0.00019578093328483123	//p7
};

/*
 * Portable approximation of e^x.
 *
 * x is split into m * ln(2) + r with m = (int)(x / ln(2)); e^r is evaluated
 * with a degree-7 polynomial and the result is scaled by 2^m by adding m to
 * the IEEE-754 exponent field.
 *
 * The exponent adjustment is done on an unsigned view of the bits: shifting a
 * negative int left is undefined behaviour in C, and m is negative for
 * negative x. There is no handling of overflow, underflow or NaN.
 */
float expf_c(float x)
{
	float a, b, c, d, xx;
	int m;

	union {
		float    f;
		unsigned u;
	} r;

	//Range Reduction:
	m = (int) (x * __expf_rng[0]);
	x = x - ((float) m) * __expf_rng[1];

	//Minimax polynomial, evaluated with Estrin's scheme
	a = (__expf_lut[4] * x) + (__expf_lut[0]);	//p0 + p1*x
	b = (__expf_lut[6] * x) + (__expf_lut[2]);	//p2 + p3*x
	c = (__expf_lut[5] * x) + (__expf_lut[1]);	//p4 + p5*x
	d = (__expf_lut[7] * x) + (__expf_lut[3]);	//p6 + p7*x
	xx = x * x;
	a = a + b * xx;
	c = c + d * xx;
	xx = xx * xx;
	r.f = a + c * xx;

	//multiply by 2 ^ m (wraps modulo 2^32, matching two's-complement addition)
	r.u += (unsigned) m << 23;

	return r.f;
}
|
||||
|
||||
/*
 * e^x for the hard-float calling convention, using the same range reduction
 * and polynomial as expf_c().
 *
 * NOTE(review): the NEON path has no C-level return statement; it appears to
 * rely on x arriving in, and the result being left in, s0 -- confirm against
 * the target ABI.
 * Without __MATH_NEON the portable expf_c() is used instead of falling off
 * the end of a non-void function.
 */
float expf_neon_hfp(float x)
{
#ifdef __MATH_NEON
	asm volatile (
	"vdup.f32 d0, d0[0] \n\t" //d0 = {x, x}

	//Range Reduction:
	"vld1.32 d2, [%0] \n\t" //d2 = {invrange, range}
	"vmul.f32 d6, d0, d2[0] \n\t" //d6 = d0 * d2[0]
	"vcvt.s32.f32 d6, d6 \n\t" //d6 = (int) d6
	"vcvt.f32.s32 d1, d6 \n\t" //d1 = (float) d6
	"vmls.f32 d0, d1, d2[1] \n\t" //d0 = d0 - d1 * d2[1]

	//polynomial:
	"vmul.f32 d1, d0, d0 \n\t" //d1 = d0*d0 = {x^2, x^2}
	"vld1.32 {d2, d3, d4, d5}, [%1] \n\t" //q1 = {p0, p4, p2, p6}, q2 = {p1, p5, p3, p7} ;
	"vmla.f32 q1, q2, d0[0] \n\t" //q1 = q1 + q2 * d0[0]
	"vmla.f32 d2, d3, d1[0] \n\t" //d2 = d2 + d3 * d1[0]
	"vmul.f32 d1, d1, d1 \n\t" //d1 = d1 * d1 = {x^4, x^4}
	"vmla.f32 d2, d1, d2[1] \n\t" //d2 = d2 + d1 * d2[1]

	//multiply by 2 ^ m
	"vshl.i32 d6, d6, #23 \n\t" //d6 = d6 << 23
	"vadd.i32 d0, d2, d6 \n\t" //d0 = d2 + d6

	:: "r"(__expf_rng), "r"(__expf_lut)
	: "d0", "d1", "q1", "q2", "d6"
	);
#else
	return expf_c(x);
#endif
}
|
||||
|
||||
/*
 * e^x entry point for the soft-float calling convention.
 *
 * Without __MATH_NEON this is a plain call to expf_c(). With it, r0 is copied
 * to s0, expf_neon_hfp() runs, and s0 is copied back to r0.
 * NOTE(review): the NEON path has no C-level return and presumably relies on
 * x arriving in r0 and the result leaving in r0 -- confirm against the ABI.
 */
float expf_neon_sfp(float x)
{
#ifndef __MATH_NEON
	return expf_c(x);
#else
	asm volatile ("vmov.f32 s0, r0 \n\t");	//s0 = r0
	expf_neon_hfp(x);
	asm volatile ("vmov.f32 r0, s0 \n\t");	//r0 = s0
#endif
}
|
||||
|
58
deps/math-neon/source/math_fabsf.c
vendored
58
deps/math-neon/source/math_fabsf.c
vendored
@ -1,58 +0,0 @@
|
||||
/*
|
||||
The MIT License (MIT)
|
||||
|
||||
Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com)
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "math_neon.h"
|
||||
|
||||
|
||||
/*
 * Portable |x|: clears the top (sign) bit of the float's 32-bit pattern and
 * leaves every other bit untouched.
 */
float fabsf_c(float x)
{
	union {
		int i;
		float f;
	} bits;

	bits.f = x;
	bits.i &= 0x7FFFFFFF;	//keep exponent and mantissa, drop the sign
	return bits.f;
}
|
||||
|
||||
/*
 * |x| for the hard-float calling convention.
 *
 * NOTE(review): the NEON path has no C-level return statement; it appears to
 * rely on x arriving in, and the result being left in, s0 -- confirm against
 * the target ABI.
 * Without __MATH_NEON the portable fabsf_c() is used instead of falling off
 * the end of a non-void function.
 */
float fabsf_neon_hfp(float x)
{
#ifdef __MATH_NEON
	asm volatile (
	"fabss s0, s0 \n\t" //s0 = fabs(s0)
	);
#else
	return fabsf_c(x);
#endif
}
|
||||
|
||||
/*
 * |x| entry point for the soft-float calling convention.
 *
 * Without __MATH_NEON this is a plain call to fabsf_c(). With it, the sign
 * bit is cleared directly in r0.
 * NOTE(review): the asm path has no C-level return and presumably relies on
 * x arriving in r0 and the result leaving in r0 -- confirm against the ABI.
 */
float fabsf_neon_sfp(float x)
{
#ifndef __MATH_NEON
	return fabsf_c(x);
#else
	asm volatile (
	"bic r0, r0, #0x80000000 \n\t" //r0 = r0 & ~(1 << 31)
	);
#endif
}
|
66
deps/math-neon/source/math_floorf.c
vendored
66
deps/math-neon/source/math_floorf.c
vendored
@ -1,66 +0,0 @@
|
||||
/*
|
||||
The MIT License (MIT)
|
||||
|
||||
Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com)
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/*
|
||||
Assumes the floating point value |x| < 2147483648
|
||||
*/
|
||||
|
||||
#include "math.h"
|
||||
#include "math_neon.h"
|
||||
|
||||
/*
 * Portable floor(x): truncate towards zero through an int, then step down by
 * one when truncation rounded up (which happens for negative non-integers).
 * x must fit in an int, as stated in the file header.
 */
float floorf_c(float x)
{
	float truncated = (float) (int) x;

	//(truncated > x) evaluates to 1 or 0
	truncated = truncated - (truncated > x);
	return truncated;
}
|
||||
|
||||
/*
 * floor(x) for the hard-float calling convention, mirroring floorf_c():
 * truncate, then subtract 1 where the truncated value is greater than x.
 *
 * NOTE(review): the NEON path has no C-level return statement; it appears to
 * rely on x arriving in, and the result being left in, d0/s0 -- confirm
 * against the target ABI.
 * Without __MATH_NEON the portable floorf_c() is used instead of falling off
 * the end of a non-void function.
 */
float floorf_neon_hfp(float x)
{
#ifdef __MATH_NEON
	asm volatile (
	"vcvt.s32.f32 d1, d0 \n\t" //d1 = (int) d0;
	"vcvt.f32.s32 d1, d1 \n\t" //d1 = (float) d1;
	"vcgt.f32 d0, d1, d0 \n\t" //d0 = (d1 > d0), all-ones mask where true
	"vshr.u32 d0, #31 \n\t" //d0 = d0 >> 31, i.e. 1 or 0
	"vcvt.f32.u32 d0, d0 \n\t" //d0 = (float) d0;
	"vsub.f32 d0, d1, d0 \n\t" //d0 = d1 - d0;
	::: "d0", "d1"
	);
#else
	return floorf_c(x);
#endif
}
|
||||
|
||||
/*
 * floor(x) entry point for the soft-float calling convention.
 *
 * Without __MATH_NEON this is a plain call to floorf_c(). With it, r0 is
 * copied to s0, floorf_neon_hfp() runs, and s0 is copied back to r0.
 * NOTE(review): the NEON path has no C-level return and presumably relies on
 * x arriving in r0 and the result leaving in r0 -- confirm against the ABI.
 */
float floorf_neon_sfp(float x)
{
#ifndef __MATH_NEON
	return floorf_c(x);
#else
	asm volatile ("vmov.f32 s0, r0 \n\t");	//s0 = r0
	floorf_neon_hfp(x);
	asm volatile ("vmov.f32 r0, s0 \n\t");	//r0 = s0
#endif
}
|
100
deps/math-neon/source/math_fmodf.c
vendored
100
deps/math-neon/source/math_fmodf.c
vendored
@ -1,100 +0,0 @@
|
||||
/*
|
||||
The MIT License (MIT)
|
||||
|
||||
Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com)
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/*
|
||||
Assumes the floating point value |x / y| < 2,147,483,648
|
||||
*/
|
||||
|
||||
#include "math_neon.h"
|
||||
|
||||
/*
 * Portable approximate fmod(x, y): x - trunc(x / y) * y, where 1 / y comes
 * from a bit-level seed refined by four Newton-Raphson steps.
 *
 * Because the reciprocal is approximate, a quotient that is an exact integer
 * can truncate one too low. |x / y| must fit in an int (see file header).
 */
float fmodf_c(float x, float y)
{
	union {
		float f;
		int i;
	} recip;
	int bias, quotient, step;
	float corr;

	//seed: force the exponent of y to 127, apply a linear estimate of 1/y,
	//then apply the same exponent offset again to undo the scaling
	recip.f = y;
	bias = 0x3F800000 - (recip.i & 0x7F800000);
	recip.i += bias;
	recip.f = 1.41176471f - 0.47058824f * recip.f;
	recip.i += bias;

	//four Newton-Raphson refinements: recip = recip * (2 - recip * y)
	for (step = 0; step < 4; step++) {
		corr = 2.0 - recip.f * y;
		recip.f *= corr;
	}

	quotient = (int)(x * recip.f);
	x -= ((float)quotient) * y;
	return x;
}
|
||||
|
||||
|
||||
/*
 * Approximate fmod(x, y) for the hard-float calling convention:
 * x - trunc(x * (1 / y)) * y, with 1 / y from vrecpe refined by four
 * vrecps/vmul steps.
 *
 * NOTE(review): the NEON path has no C-level return statement; it appears to
 * rely on x and y arriving in d0[0] and d0[1] and on the result being left in
 * d0 -- confirm against the target ABI.
 * Without __MATH_NEON the portable fmodf_c() is used instead of falling off
 * the end of a non-void function.
 */
float fmodf_neon_hfp(float x, float y)
{
#ifdef __MATH_NEON
	asm volatile (
	"vdup.f32 d1, d0[1] \n\t" //d1 = {y, y}
	"vdup.f32 d0, d0[0] \n\t" //d0 = {x, x}

	//fast reciprocal approximation
	"vrecpe.f32 d2, d1 \n\t" //d2 = ~1.0 / d1
	"vrecps.f32 d3, d2, d1 \n\t" //d3 = 2.0 - d2 * d1;
	"vmul.f32 d2, d2, d3 \n\t" //d2 = d2 * d3;
	"vrecps.f32 d3, d2, d1 \n\t" //d3 = 2.0 - d2 * d1;
	"vmul.f32 d2, d2, d3 \n\t" //d2 = d2 * d3;
	"vrecps.f32 d3, d2, d1 \n\t" //d3 = 2.0 - d2 * d1;
	"vmul.f32 d2, d2, d3 \n\t" //d2 = d2 * d3;
	"vrecps.f32 d3, d2, d1 \n\t" //d3 = 2.0 - d2 * d1;
	"vmul.f32 d2, d2, d3 \n\t" //d2 = d2 * d3;

	"vmul.f32 d2, d2, d0 \n\t" //d2 = d2 * d0;
	"vcvt.s32.f32 d2, d2 \n\t" //d2 = (int) d2;
	"vcvt.f32.s32 d2, d2 \n\t" //d2 = (float) d2;
	"vmls.f32 d0, d1, d2 \n\t" //d0 = d0 - d1 * d2;

	::: "d0", "d1", "d2", "d3"
	);
#else
	return fmodf_c(x, y);
#endif
}
|
||||
|
||||
|
||||
/*
 * fmod(x, y) entry point for the soft-float calling convention.
 *
 * Without __MATH_NEON this is a plain call to fmodf_c(). With it, r0 and r1
 * are copied to s0 and s1, fmodf_neon_hfp() runs, and s0 is copied to r0.
 * NOTE(review): the NEON path has no C-level return and presumably relies on
 * x/y arriving in r0/r1 and the result leaving in r0 -- confirm against the
 * ABI.
 */
float fmodf_neon_sfp(float x, float y)
{
#ifndef __MATH_NEON
	return fmodf_c(x,y);
#else
	asm volatile ("vmov.f32 s0, r0 \n\t");	//s0 = r0
	asm volatile ("vmov.f32 s1, r1 \n\t");	//s1 = r1
	fmodf_neon_hfp(x, y);
	asm volatile ("vmov.f32 r0, s0 \n\t");	//r0 = s0
#endif
}
|
79
deps/math-neon/source/math_invsqrtf.c
vendored
79
deps/math-neon/source/math_invsqrtf.c
vendored
@ -1,79 +0,0 @@
|
||||
/*
|
||||
The MIT License (MIT)
|
||||
|
||||
Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com)
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "math.h"
|
||||
#include "math_neon.h"
|
||||
|
||||
/*
 * Portable approximate 1 / sqrt(x): a bit-level initial estimate followed by
 * two Newton-Raphson refinements.
 */
float invsqrtf_c(float x)
{
	union {
		float f;
		int i;
	} est;
	float scale, xe;
	int pass;

	//initial estimate (stands in for VRSQRTE)
	est.f = x;
	est.i = 0x5F3759DF - (est.i >> 1);

	//each pass: est = est * (3 - x * est * est) / 2 (stands in for VRSQRTS)
	for (pass = 0; pass < 2; pass++) {
		xe = x * est.f;
		scale = (3.0f - xe * est.f) * 0.5;
		est.f = est.f * scale;
	}

	return est.f;
}
|
||||
|
||||
/*
 * Approximate 1 / sqrt(x) for the hard-float calling convention: vrsqrte
 * estimate refined by two vrsqrts/vmul steps.
 *
 * NOTE(review): the NEON path has no C-level return statement; it appears to
 * rely on x arriving in, and the result being left in, d0/s0 -- confirm
 * against the target ABI.
 * Without __MATH_NEON the portable invsqrtf_c() is used instead of falling
 * off the end of a non-void function.
 */
float invsqrtf_neon_hfp(float x)
{
#ifdef __MATH_NEON
	asm volatile (

	"vmov.f32 d1, d0 \n\t" //d1 = d0
	"vrsqrte.f32 d0, d0 \n\t" //d0 = ~ 1.0 / sqrt(d0)
	"vmul.f32 d2, d0, d1 \n\t" //d2 = d0 * d1
	"vrsqrts.f32 d3, d2, d0 \n\t" //d3 = (3 - d0 * d2) / 2
	"vmul.f32 d0, d0, d3 \n\t" //d0 = d0 * d3
	"vmul.f32 d2, d0, d1 \n\t" //d2 = d0 * d1
	"vrsqrts.f32 d3, d2, d0 \n\t" //d3 = (3 - d0 * d2) / 2
	"vmul.f32 d0, d0, d3 \n\t" //d0 = d0 * d3

	::: "d0", "d1", "d2", "d3"
	);
#else
	return invsqrtf_c(x);
#endif
}
|
||||
|
||||
/*
 * 1 / sqrt(x) entry point for the soft-float calling convention.
 *
 * Without __MATH_NEON this is a plain call to invsqrtf_c(). With it, r0 is
 * copied to s0, invsqrtf_neon_hfp() runs, and s0 is copied back to r0.
 * NOTE(review): the NEON path has no C-level return and presumably relies on
 * x arriving in r0 and the result leaving in r0 -- confirm against the ABI.
 */
float invsqrtf_neon_sfp(float x)
{
#ifndef __MATH_NEON
	return invsqrtf_c(x);
#else
	asm volatile ("vmov.f32 s0, r0 \n\t");	//s0 = r0
	invsqrtf_neon_hfp(x);
	asm volatile ("vmov.f32 r0, s0 \n\t");	//r0 = s0
#endif
}
|
||||
|
67
deps/math-neon/source/math_ldexpf.c
vendored
67
deps/math-neon/source/math_ldexpf.c
vendored
@ -1,67 +0,0 @@
|
||||
/*
|
||||
The MIT License (MIT)
|
||||
|
||||
Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com)
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "math.h"
|
||||
#include "math_neon.h"
|
||||
|
||||
/*
 * Portable m * 2^e, computed by adding e to the IEEE-754 exponent field of m.
 *
 * The addition is done on an unsigned view of the bits: shifting a negative
 * int left is undefined behaviour in C, and e may be negative. There is no
 * handling of zero, denormals, overflow or underflow.
 */
float ldexpf_c(float m, int e)
{
	union {
		float    f;
		unsigned u;
	} r;

	r.f = m;
	r.u += (unsigned) e << 23;	//wraps modulo 2^32, like two's-complement add
	return r.f;
}
|
||||
|
||||
/*
 * m * 2^e for the hard-float calling convention: e << 23 is added to the bit
 * pattern in d0.
 *
 * NOTE(review): the NEON path has no C-level return statement; it appears to
 * rely on m arriving in s0, e arriving in r0, and the result being left in
 * s0 -- confirm against the target ABI.
 * r0 is overwritten by the asm, so it is declared as clobbered.
 * Without __MATH_NEON the portable ldexpf_c() is used instead of falling off
 * the end of a non-void function.
 */
float ldexpf_neon_hfp(float m, int e)
{
#ifdef __MATH_NEON
	asm volatile (
	"lsl r0, r0, #23 \n\t" //r0 = r0 << 23
	"vdup.i32 d1, r0 \n\t" //d1 = {r0, r0}
	"vadd.i32 d0, d0, d1 \n\t" //d0 = d0 + d1
	::: "d0", "d1", "r0"
	);
#else
	return ldexpf_c(m, e);
#endif
}
|
||||
|
||||
/*
 * m * 2^e entry point for the soft-float calling convention.
 *
 * Without __MATH_NEON this is a plain call to ldexpf_c(). With it, e << 23 is
 * added to the bit pattern of m in NEON registers and the result is moved
 * back to r0.
 * NOTE(review): the asm path has no C-level return and presumably relies on
 * m/e arriving in r0/r1 and the result leaving in r0 -- confirm against the
 * ABI.
 * r0 and r1 are overwritten by the asm, so both are declared as clobbered.
 */
float ldexpf_neon_sfp(float m, int e)
{
#ifdef __MATH_NEON
	asm volatile (
	"lsl r1, r1, #23 \n\t" //r1 = r1 << 23
	"vdup.f32 d0, r0 \n\t" //d0 = {r0, r0}
	"vdup.i32 d1, r1 \n\t" //d1 = {r1, r1}
	"vadd.i32 d0, d0, d1 \n\t" //d0 = d0 + d1
	"vmov.f32 r0, s0 \n\t" //r0 = s0
	::: "d0", "d1", "r0", "r1"
	);
#else
	return ldexpf_c(m,e);
#endif
}
|
135
deps/math-neon/source/math_log10f.c
vendored
135
deps/math-neon/source/math_log10f.c
vendored
@ -1,135 +0,0 @@
|
||||
/*
|
||||
The MIT License (MIT)
|
||||
|
||||
Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com)
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/*
|
||||
Based on:
|
||||
|
||||
log10(x) = log10((1+m) * (2^n))
|
||||
log10(x) = n * log10(2) + log10(1 + m)
|
||||
log10(1+m) = Poly(1+m)
|
||||
|
||||
where Poly(x) is the Minimax approximation of log10(x) over the
|
||||
range [1, 2]
|
||||
|
||||
Test func : log10f(x)
|
||||
Test Range: 1 < x < 10000
|
||||
Peak Error: ~0.000040%
|
||||
RMS Error: ~0.000008%
|
||||
*/
|
||||
|
||||
#include "math.h"
|
||||
#include "math_neon.h"
|
||||
|
||||
//log10(2): weight of one unit of binary exponent.
const float __log10f_rng = 0.3010299957f;

//Polynomial coefficients, interleaved so that two 4-float loads yield
//{p0, p4, p2, p6} and {p1, p5, p3, p7}.
const float __log10f_lut[8] = {
	-0.99697286229624, 		//p0
	-1.07301643912502, 		//p4
	-2.46980061535534, 		//p2
	-0.07176870463131, 		//p6
	2.247870219989470, 		//p1
	0.366547581117400, 		//p5
	1.991005185100089, 		//p3
	0.006135635201050, 		//p7
};

/*
 * Portable approximation of log10(x).
 *
 * The binary exponent n is stripped from x so the remaining value has
 * exponent 0; a degree-7 polynomial is evaluated on it and n * log10(2) is
 * added back.
 *
 * The exponent is removed on an unsigned view of the bits: shifting a
 * negative int left is undefined behaviour in C, and the exponent is negative
 * for x < 1. There is no handling of x <= 0, infinities or NaN.
 */
float log10f_c(float x)
{
	float a, b, c, d, xx;
	int m;

	union {
		float    f;
		int      i;
		unsigned u;
	} r;

	//extract exponent
	r.f = x;
	m = (r.i >> 23);
	m = m - 127;
	r.u -= (unsigned) m << 23;	//wraps modulo 2^32, like two's-complement subtract

	//Minimax polynomial, evaluated with Estrin's scheme
	xx = r.f * r.f;
	a = (__log10f_lut[4] * r.f) + (__log10f_lut[0]);	//p0 + p1*x
	b = (__log10f_lut[6] * r.f) + (__log10f_lut[2]);	//p2 + p3*x
	c = (__log10f_lut[5] * r.f) + (__log10f_lut[1]);	//p4 + p5*x
	d = (__log10f_lut[7] * r.f) + (__log10f_lut[3]);	//p6 + p7*x
	a = a + b * xx;
	c = c + d * xx;
	xx = xx * xx;
	r.f = a + c * xx;

	//add exponent
	r.f = r.f + ((float) m) * __log10f_rng;

	return r.f;
}
|
||||
|
||||
/*
 * log10(x) for the hard-float calling convention, using the same exponent
 * extraction and polynomial as log10f_c().
 *
 * NOTE(review): the NEON path has no C-level return statement; it appears to
 * rely on x arriving in, and the result being left in, s0 -- confirm against
 * the target ABI.
 * Without __MATH_NEON the portable log10f_c() is used instead of falling off
 * the end of a non-void function.
 */
float log10f_neon_hfp(float x)
{
#ifdef __MATH_NEON
	asm volatile (

	"vdup.f32 d0, d0[0] \n\t" //d0 = {x,x};

	//extract exponent
	"vmov.i32 d2, #127 \n\t" //d2 = 127;
	"vshr.u32 d6, d0, #23 \n\t" //d6 = d0 >> 23;
	"vsub.i32 d6, d6, d2 \n\t" //d6 = d6 - d2;
	"vshl.u32 d1, d6, #23 \n\t" //d1 = d6 << 23;
	"vsub.i32 d0, d0, d1 \n\t" //d0 = d0 - d1;

	//polynomial:
	"vmul.f32 d1, d0, d0 \n\t" //d1 = d0*d0 = {x^2, x^2}
	"vld1.32 {d2, d3, d4, d5}, [%1] \n\t" //q1 = {p0, p4, p2, p6}, q2 = {p1, p5, p3, p7} ;
	"vmla.f32 q1, q2, d0[0] \n\t" //q1 = q1 + q2 * d0[0]
	"vmla.f32 d2, d3, d1[0] \n\t" //d2 = d2 + d3 * d1[0]
	"vmul.f32 d1, d1, d1 \n\t" //d1 = d1 * d1 = {x^4, x^4}
	"vmla.f32 d2, d1, d2[1] \n\t" //d2 = d2 + d1 * d2[1]

	//add exponent
	"vdup.32 d7, %0 \n\t" //d7 = {rng, rng}
	"vcvt.f32.s32 d6, d6 \n\t" //d6 = (float) d6
	"vmla.f32 d2, d6, d7 \n\t" //d2 = d2 + d6 * d7

	"vmov.f32 s0, s4 \n\t" //s0 = s4

	:: "r"(__log10f_rng), "r"(__log10f_lut)
	: "d0", "d1", "q1", "q2", "d6", "d7"
	);
#else
	return log10f_c(x);
#endif
}
|
||||
|
||||
|
||||
/*
 * log10(x) entry point for the soft-float calling convention.
 *
 * Without __MATH_NEON this is a plain call to log10f_c(). With it, r0 is
 * copied to s0, log10f_neon_hfp() runs, and s0 is copied back to r0.
 * NOTE(review): the NEON path has no C-level return and presumably relies on
 * x arriving in r0 and the result leaving in r0 -- confirm against the ABI.
 */
float log10f_neon_sfp(float x)
{
#ifndef __MATH_NEON
	return log10f_c(x);
#else
	asm volatile ("vmov.f32 s0, r0 \n\t");	//s0 = r0
	log10f_neon_hfp(x);
	asm volatile ("vmov.f32 r0, s0 \n\t");	//r0 = s0
#endif
}
|
135
deps/math-neon/source/math_logf.c
vendored
135
deps/math-neon/source/math_logf.c
vendored
@ -1,135 +0,0 @@
|
||||
/*
|
||||
The MIT License (MIT)
|
||||
|
||||
Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com)
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/*
|
||||
Based on:
|
||||
|
||||
log(x) = log((1+m) * (2^n))
|
||||
log(x) = n * log(2) + log(1 + m)
|
||||
log(1+m) = Poly(1+m)
|
||||
|
||||
where Poly(x) is the Minimax approximation of log(x) over the
|
||||
range [1, 2]
|
||||
|
||||
Test func : logf(x)
|
||||
Test Range: 1 < x < 10000
|
||||
Peak Error: ~0.000601%
|
||||
RMS Error: ~0.000005%
|
||||
*/
|
||||
|
||||
#include "math.h"
|
||||
#include "math_neon.h"
|
||||
|
||||
//ln(2): weight of one unit of binary exponent.
const float __logf_rng = 0.693147180f;

//Polynomial coefficients, interleaved so that two 4-float loads yield
//{p0, p4, p2, p6} and {p1, p5, p3, p7}.
const float __logf_lut[8] = {
	-2.295614848256274, 	//p0
	-2.470711633419806, 	//p4
	-5.686926051100417, 	//p2
	-0.165253547131978, 	//p6
	+5.175912446351073, 	//p1
	+0.844006986174912, 	//p5
	+4.584458825456749, 	//p3
	+0.014127821926000		//p7
};

/*
 * Portable approximation of the natural logarithm.
 *
 * The binary exponent n is stripped from x so the remaining value has
 * exponent 0; a degree-7 polynomial is evaluated on it and n * ln(2) is added
 * back.
 *
 * The exponent is removed on an unsigned view of the bits: shifting a
 * negative int left is undefined behaviour in C, and the exponent is negative
 * for x < 1. There is no handling of x <= 0, infinities or NaN.
 */
float logf_c(float x)
{
	float a, b, c, d, xx;
	int m;

	union {
		float    f;
		int      i;
		unsigned u;
	} r;

	//extract exponent
	r.f = x;
	m = (r.i >> 23);
	m = m - 127;
	r.u -= (unsigned) m << 23;	//wraps modulo 2^32, like two's-complement subtract

	//Minimax polynomial, evaluated with Estrin's scheme
	xx = r.f * r.f;
	a = (__logf_lut[4] * r.f) + (__logf_lut[0]);	//p0 + p1*x
	b = (__logf_lut[6] * r.f) + (__logf_lut[2]);	//p2 + p3*x
	c = (__logf_lut[5] * r.f) + (__logf_lut[1]);	//p4 + p5*x
	d = (__logf_lut[7] * r.f) + (__logf_lut[3]);	//p6 + p7*x
	a = a + b * xx;
	c = c + d * xx;
	xx = xx * xx;
	r.f = a + c * xx;

	//add exponent
	r.f = r.f + ((float) m) * __logf_rng;

	return r.f;
}
|
||||
|
||||
/*
 * Natural logarithm for the hard-float calling convention, using the same
 * exponent extraction and polynomial as logf_c().
 *
 * NOTE(review): the NEON path has no C-level return statement; it appears to
 * rely on x arriving in, and the result being left in, s0 -- confirm against
 * the target ABI.
 * Without __MATH_NEON the portable logf_c() is used instead of falling off
 * the end of a non-void function.
 */
float logf_neon_hfp(float x)
{
#ifdef __MATH_NEON
	asm volatile (

	"vdup.f32 d0, d0[0] \n\t" //d0 = {x,x};

	//extract exponent
	"vmov.i32 d2, #127 \n\t" //d2 = 127;
	"vshr.u32 d6, d0, #23 \n\t" //d6 = d0 >> 23;
	"vsub.i32 d6, d6, d2 \n\t" //d6 = d6 - d2;
	"vshl.u32 d1, d6, #23 \n\t" //d1 = d6 << 23;
	"vsub.i32 d0, d0, d1 \n\t" //d0 = d0 - d1;

	//polynomial:
	"vmul.f32 d1, d0, d0 \n\t" //d1 = d0*d0 = {x^2, x^2}
	"vld1.32 {d2, d3, d4, d5}, [%1] \n\t" //q1 = {p0, p4, p2, p6}, q2 = {p1, p5, p3, p7} ;
	"vmla.f32 q1, q2, d0[0] \n\t" //q1 = q1 + q2 * d0[0]
	"vmla.f32 d2, d3, d1[0] \n\t" //d2 = d2 + d3 * d1[0]
	"vmul.f32 d1, d1, d1 \n\t" //d1 = d1 * d1 = {x^4, x^4}
	"vmla.f32 d2, d1, d2[1] \n\t" //d2 = d2 + d1 * d2[1]

	//add exponent
	"vdup.32 d7, %0 \n\t" //d7 = {rng, rng}
	"vcvt.f32.s32 d6, d6 \n\t" //d6 = (float) d6
	"vmla.f32 d2, d6, d7 \n\t" //d2 = d2 + d6 * d7

	"vmov.f32 s0, s4 \n\t" //s0 = s4

	:: "r"(__logf_rng), "r"(__logf_lut)
	: "d0", "d1", "q1", "q2", "d6", "d7"
	);
#else
	return logf_c(x);
#endif
}
|
||||
|
||||
/*
 * Natural-logarithm entry point for the soft-float calling convention.
 *
 * Without __MATH_NEON this is a plain call to logf_c(). With it, r0 is copied
 * to s0, logf_neon_hfp() runs, and s0 is copied back to r0.
 * NOTE(review): the NEON path has no C-level return and presumably relies on
 * x arriving in r0 and the result leaving in r0 -- confirm against the ABI.
 */
float logf_neon_sfp(float x)
{
#ifndef __MATH_NEON
	return logf_c(x);
#else
	asm volatile ("vmov.f32 s0, r0 \n\t");	//s0 = r0
	logf_neon_hfp(x);
	asm volatile ("vmov.f32 r0, s0 \n\t");	//r0 = s0
#endif
}
|
||||
|
95
deps/math-neon/source/math_mat2.c
vendored
95
deps/math-neon/source/math_mat2.c
vendored
@ -1,95 +0,0 @@
|
||||
/*
|
||||
The MIT License (MIT)
|
||||
|
||||
Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com)
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/*
|
||||
Matrices are specified in column major format:
|
||||
|
||||
| a c |
|
||||
| b d |
|
||||
|
||||
therefore m[2] = c
|
||||
*/
|
||||
|
||||
#include "math_neon.h"
|
||||
|
||||
//matrix matrix multiplication. d = m0 * m1;
//All four products are computed before anything is stored, so d may alias
//m0 or m1 (the NEON path likewise loads both inputs before it stores).
void
matmul2_c(float m0[4], float m1[4], float d[4])
{
	const float r0 = m0[0]*m1[0] + m0[2]*m1[1];
	const float r1 = m0[1]*m1[0] + m0[3]*m1[1];
	const float r2 = m0[0]*m1[2] + m0[2]*m1[3];
	const float r3 = m0[1]*m1[2] + m0[3]*m1[3];

	d[0] = r0;
	d[1] = r1;
	d[2] = r2;
	d[3] = r3;
}
|
||||
|
||||
//2x2 column-major matrix product, d = m0 * m1.
//Uses NEON when __MATH_NEON is defined, otherwise defers to matmul2_c().
void
matmul2_neon(float m0[4], float m1[4], float d[4])
{
#ifndef __MATH_NEON
	matmul2_c(m0, m1, d);
#else
	asm volatile (
	"vld1.32 {d0, d1}, [%0] \n\t" //q0 = m0: d0 = column 0, d1 = column 1
	"vld1.32 {d2, d3}, [%1] \n\t" //q1 = m1: d2 = column 0, d3 = column 1

	"vmul.f32 d4, d0, d2[0] \n\t" //d4 = d0 * m1[0]
	"vmul.f32 d5, d0, d3[0] \n\t" //d5 = d0 * m1[2]
	"vmla.f32 d4, d1, d2[1] \n\t" //d4 += d1 * m1[1]
	"vmla.f32 d5, d1, d3[1] \n\t" //d5 += d1 * m1[3]

	"vst1.32 {d4, d5}, [%2] \n\t" //d = q2
	:: "r"(m0), "r"(m1), "r"(d)
	: "q0", "q1", "q2", "memory"
	);
#endif
}
|
||||
|
||||
|
||||
//matrix vector multiplication. d = m * v
//Both components are computed before anything is stored, so d may alias v
//(the NEON path likewise loads its inputs before it stores).
void
matvec2_c(float m[4], float v[2], float d[2])
{
	const float r0 = m[0]*v[0] + m[2]*v[1];
	const float r1 = m[1]*v[0] + m[3]*v[1];

	d[0] = r0;
	d[1] = r1;
}
|
||||
|
||||
//2x2 column-major matrix times 2-vector, d = m * v.
//Uses NEON when __MATH_NEON is defined, otherwise defers to matvec2_c().
void
matvec2_neon(float m[4], float v[2], float d[2])
{
#ifndef __MATH_NEON
	matvec2_c(m, v, d);
#else
	asm volatile (
	"vld1.32 d0, [%1] \n\t" //d0 = v
	"vld1.32 {d1, d2}, [%0] \n\t" //d1 = column 0 of m, d2 = column 1 of m

	"vmul.f32 d3, d1, d0[0] \n\t" //d3 = d1 * v[0]
	"vmla.f32 d3, d2, d0[1] \n\t" //d3 += d2 * v[1]

	"vst1.32 d3, [%2] \n\t" //d = d3
	:: "r"(m), "r"(v), "r"(d)
	: "d0", "d1", "d2","d3", "memory"
	);
#endif
}
|
131
deps/math-neon/source/math_mat3.c
vendored
131
deps/math-neon/source/math_mat3.c
vendored
@ -1,131 +0,0 @@
|
||||
/*
|
||||
The MIT License (MIT)
|
||||
|
||||
Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com)
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/*
|
||||
Matrices are specified in column major format:
|
||||
|
||||
| x0 x2 |
|
||||
| x1 x3 |
|
||||
|
||||
therefore m[2] = x2
|
||||
|
||||
*/
|
||||
|
||||
#include "math_neon.h"
|
||||
|
||||
//matrix matrix multiplication. d = m0 * m1;
//The product is built in a temporary and copied out at the end, so d may
//alias m0 or m1 (the NEON path likewise loads both inputs before it stores).
void
matmul3_c(float m0[9], float m1[9], float d[9])
{
	float r[9];
	int col, row;

	//column-major: element (row, col) lives at index col * 3 + row
	for (col = 0; col < 3; col++) {
		for (row = 0; row < 3; row++) {
			r[col*3 + row] = m0[row]*m1[col*3] + m0[3 + row]*m1[col*3 + 1] + m0[6 + row]*m1[col*3 + 2];
		}
	}

	for (col = 0; col < 9; col++) {
		d[col] = r[col];
	}
}
|
||||
|
||||
//3x3 column-major matrix product, d = m0 * m1.
//Uses NEON when __MATH_NEON is defined, otherwise defers to matmul3_c().
//The asm writes q0-q8, so all of them are listed as clobbered.
void
matmul3_neon(float m0[9], float m1[9], float d[9])
{
#ifdef __MATH_NEON
	asm volatile (
	"vld1.32 {d0, d1}, [%1]! \n\t" //q0 = m1[0..3]
	"vld1.32 {d2, d3}, [%1]! \n\t" //q1 = m1[4..7]
	"flds s8, [%1] \n\t" //s8 = m1[8]

	"vld1.32 {d6, d7}, [%0] \n\t" //q3 = m0[0..3], column 0 in the low three lanes
	"add %0, %0, #12 \n\t" //advance three floats
	"vld1.32 {d8, d9}, [%0] \n\t" //q4 = m0[3..6], column 1 in the low three lanes
	"add %0, %0, #12 \n\t" //advance three floats
	"vld1.32 {d10}, [%0] \n\t" //d10 = m0[6..7]
	"add %0, %0, #8 \n\t" //advance two floats
	"flds s22, [%0] \n\t" //s22 = m0[8], completing column 2 in q5

	"vmul.f32 q6, q3, d0[0] \n\t" //q6 = q3 * m1[0]
	"vmul.f32 q7, q3, d1[1] \n\t" //q7 = q3 * m1[3]
	"vmul.f32 q8, q3, d3[0] \n\t" //q8 = q3 * m1[6]
	"vmla.f32 q6, q4, d0[1] \n\t" //q6 += q4 * m1[1]
	"vmla.f32 q7, q4, d2[0] \n\t" //q7 += q4 * m1[4]
	"vmla.f32 q8, q4, d3[1] \n\t" //q8 += q4 * m1[7]
	"vmla.f32 q6, q5, d1[0] \n\t" //q6 += q5 * m1[2]
	"vmla.f32 q7, q5, d2[1] \n\t" //q7 += q5 * m1[5]
	"vmla.f32 q8, q5, d4[0] \n\t" //q8 += q5 * m1[8]

	"vmov.f32 q0, q8 \n\t" //q0 = q8, so the last column can be stored via d0 and s2
	"vst1.32 {d12, d13}, [%2] \n\t" //d[0..3] = q6 (d[3] is rewritten by the next store)
	"add %2, %2, #12 \n\t" //advance three floats
	"vst1.32 {d14, d15}, [%2] \n\t" //d[3..6] = q7 (d[6] is rewritten by the next store)
	"add %2, %2, #12 \n\t" //advance three floats
	"vst1.32 {d0}, [%2] \n\t" //d[6..7] = d0
	"add %2, %2, #8 \n\t" //advance two floats
	"fsts s2, [%2] \n\t" //d[8] = s2

	: "+r"(m0), "+r"(m1), "+r"(d):
	: "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "memory"
	);
#else
	matmul3_c(m0, m1, d);
#endif
}
|
||||
|
||||
//3x3 matrix times 3-element vector, d = m * v.
//m is stored column by column (element (row, col) at index col * 3 + row);
//d[0], d[1], d[2] are written in that order.
void
matvec3_c(float m[9], float v[3], float d[3])
{
    int row;

    for (row = 0; row < 3; row++) {
        d[row] = m[row] * v[0] + m[row + 3] * v[1] + m[row + 6] * v[2];
    }
}
|
||||
|
||||
//NEON 3x3 matrix times 3-element vector, d = m * v, with the same layout
//as matvec3_c. When __MATH_NEON is not defined the call is forwarded to
//matvec3_c.
//
//v and the last matrix column are loaded as a 2-float vector plus one
//scalar, so no byte beyond v[2] or m[8] is read. The fourth lane of the
//affected q registers is left unspecified; the fourth lane of the result
//is never stored. m, v and d are advanced by the asm and are therefore
//read-write operands, as is the scratch stride register tmp.
void
matvec3_neon(float m[9], float v[3], float d[3])
{
#ifdef __MATH_NEON
    int tmp = 0;
    asm volatile (
    "mov        %3, #12             \n\t"   //tmp = 12 bytes = one matrix column
    "vld1.32    {d0}, [%1]!         \n\t"   //d0 = {v[0], v[1]}
    "flds       s2, [%1]            \n\t"   //d1[0] = v[2]
    "vld1.32    {d2, d3}, [%0], %3  \n\t"   //q1 = m[0..3], m += 12 bytes
    "vld1.32    {d4, d5}, [%0], %3  \n\t"   //q2 = m[3..6], m += 12 bytes
    "vld1.32    {d6}, [%0]!         \n\t"   //d6 = {m[6], m[7]}
    "flds       s14, [%0]           \n\t"   //d7[0] = m[8]

    "vmul.f32   q9, q1, d0[0]       \n\t"   //q9  = q1 * v[0]
    "vmla.f32   q9, q2, d0[1]       \n\t"   //q9 += q2 * v[1]
    "vmla.f32   q9, q3, d1[0]       \n\t"   //q9 += q3 * v[2]
    "vmov.f32   q0, q9              \n\t"   //q0 = q9, so the third lane can be stored from s2

    "vst1.32    d0, [%2]!           \n\t"   //d[0..1] = q0 low half
    "fsts       s2, [%2]            \n\t"   //d[2] = q0[2]

    : "+r"(m), "+r"(v), "+r"(d), "+r"(tmp):
    : "q0", "q1", "q2", "q3", "q9", "memory"
    );
#else
    matvec3_c(m, v, d);
#endif
}
|
144
deps/math-neon/source/math_mat4.c
vendored
144
deps/math-neon/source/math_mat4.c
vendored
@ -1,144 +0,0 @@
|
||||
/*
|
||||
The MIT License (MIT)
|
||||
|
||||
Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com)
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/*
|
||||
Matrices are specified in column major format:
|
||||
|
||||
| x0 x2 |
|
||||
| x1 x3 |
|
||||
|
||||
therefore m[2] = x2
|
||||
|
||||
*/
|
||||
|
||||
#include "math_neon.h"
|
||||
|
||||
//4x4 matrix-matrix multiplication, d = m0 * m1.
//All three matrices are stored column by column: element (row, col)
//lives at index col * 4 + row. Outputs are written in index order
//d[0] .. d[15].
void
matmul4_c(float m0[16], float m1[16], float d[16])
{
    int col, row;

    for (col = 0; col < 4; col++) {
        //the four m1 entries of this output column
        const float *rhs = m1 + 4 * col;

        for (row = 0; row < 4; row++) {
            d[4 * col + row] = m0[row] * rhs[0] + m0[row + 4] * rhs[1] + m0[row + 8] * rhs[2] + m0[row + 12] * rhs[3];
        }
    }
}
|
||||
|
||||
//NEON 4x4 matrix-matrix multiplication, d = m0 * m1, with the same
//column-by-column layout as matmul4_c. When __MATH_NEON is not defined
//the call is forwarded to matmul4_c.
//
//q0-q3 hold the four columns of m1, q8-q11 the four columns of m0 and
//q12-q15 accumulate the four result columns. The pointers are advanced by
//the post-incrementing loads/stores, hence the read-write operands.
void
matmul4_neon(float m0[16], float m1[16], float d[16])
{
#ifdef __MATH_NEON
    asm volatile (
    //load m1
    "vld1.32    {d0, d1}, [%1]!     \n\t"   //q0  = m1[0..3]
    "vld1.32    {d2, d3}, [%1]!     \n\t"   //q1  = m1[4..7]
    "vld1.32    {d4, d5}, [%1]!     \n\t"   //q2  = m1[8..11]
    "vld1.32    {d6, d7}, [%1]      \n\t"   //q3  = m1[12..15]
    //load m0
    "vld1.32    {d16, d17}, [%0]!   \n\t"   //q8  = m0[0..3]
    "vld1.32    {d18, d19}, [%0]!   \n\t"   //q9  = m0[4..7]
    "vld1.32    {d20, d21}, [%0]!   \n\t"   //q10 = m0[8..11]
    "vld1.32    {d22, d23}, [%0]    \n\t"   //q11 = m0[12..15]

    //first m0 column times the first entry of every m1 column
    "vmul.f32   q12, q8, d0[0]      \n\t"   //q12  = q8 * m1[0]
    "vmul.f32   q13, q8, d2[0]      \n\t"   //q13  = q8 * m1[4]
    "vmul.f32   q14, q8, d4[0]      \n\t"   //q14  = q8 * m1[8]
    "vmul.f32   q15, q8, d6[0]      \n\t"   //q15  = q8 * m1[12]
    //second m0 column
    "vmla.f32   q12, q9, d0[1]      \n\t"   //q12 += q9 * m1[1]
    "vmla.f32   q13, q9, d2[1]      \n\t"   //q13 += q9 * m1[5]
    "vmla.f32   q14, q9, d4[1]      \n\t"   //q14 += q9 * m1[9]
    "vmla.f32   q15, q9, d6[1]      \n\t"   //q15 += q9 * m1[13]
    //third m0 column
    "vmla.f32   q12, q10, d1[0]     \n\t"   //q12 += q10 * m1[2]
    "vmla.f32   q13, q10, d3[0]     \n\t"   //q13 += q10 * m1[6]
    "vmla.f32   q14, q10, d5[0]     \n\t"   //q14 += q10 * m1[10]
    "vmla.f32   q15, q10, d7[0]     \n\t"   //q15 += q10 * m1[14]
    //fourth m0 column
    "vmla.f32   q12, q11, d1[1]     \n\t"   //q12 += q11 * m1[3]
    "vmla.f32   q13, q11, d3[1]     \n\t"   //q13 += q11 * m1[7]
    "vmla.f32   q14, q11, d5[1]     \n\t"   //q14 += q11 * m1[11]
    "vmla.f32   q15, q11, d7[1]     \n\t"   //q15 += q11 * m1[15]

    //store the result
    "vst1.32    {d24, d25}, [%2]!   \n\t"   //d[0..3]   = q12
    "vst1.32    {d26, d27}, [%2]!   \n\t"   //d[4..7]   = q13
    "vst1.32    {d28, d29}, [%2]!   \n\t"   //d[8..11]  = q14
    "vst1.32    {d30, d31}, [%2]    \n\t"   //d[12..15] = q15

    : "+r"(m0), "+r"(m1), "+r"(d) :
    : "q0", "q1", "q2", "q3", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15",
    "memory"
    );
#else
    matmul4_c(m0, m1, d);
#endif
}
|
||||
|
||||
|
||||
//4x4 matrix times 4-element vector, d = m * v.
//m is stored column by column (element (row, col) at index col * 4 + row);
//d[0] .. d[3] are written in that order.
void
matvec4_c(float m[16], float v[4], float d[4])
{
    int row;

    for (row = 0; row < 4; row++) {
        d[row] = m[row] * v[0] + m[row + 4] * v[1] + m[row + 8] * v[2] + m[row + 12] * v[3];
    }
}
|
||||
|
||||
//NEON 4x4 matrix times 4-element vector, d = m * v, with the same layout
//as matvec4_c. When __MATH_NEON is not defined the call is forwarded to
//matvec4_c.
//
//The matrix pointer is advanced by the post-incrementing loads, so it is a
//read-write operand; v and d are only read as addresses and stay inputs.
void
matvec4_neon(float m[16], float v[4], float d[4])
{
#ifdef __MATH_NEON
    asm volatile (
    "vld1.32    {d0, d1}, [%1]      \n\t"   //q0  = v[0..3]
    "vld1.32    {d18, d19}, [%0]!   \n\t"   //q9  = m[0..3]
    "vld1.32    {d20, d21}, [%0]!   \n\t"   //q10 = m[4..7]
    "vld1.32    {d22, d23}, [%0]!   \n\t"   //q11 = m[8..11]
    "vld1.32    {d24, d25}, [%0]!   \n\t"   //q12 = m[12..15]

    "vmul.f32   q13, q9, d0[0]      \n\t"   //q13  = q9  * v[0]
    "vmla.f32   q13, q10, d0[1]     \n\t"   //q13 += q10 * v[1]
    "vmla.f32   q13, q11, d1[0]     \n\t"   //q13 += q11 * v[2]
    "vmla.f32   q13, q12, d1[1]     \n\t"   //q13 += q12 * v[3]

    "vst1.32    {d26, d27}, [%2]    \n\t"   //d[0..3] = q13
    : "+r"(m)
    : "r"(v), "r"(d)
    : "q0", "q9", "q10", "q11", "q12", "q13", "memory"
    );
#else
    matvec4_c(m, v, d);
#endif
}
|
||||
|
||||
|
||||
|
||||
|
||||
|
71
deps/math-neon/source/math_modf.c
vendored
71
deps/math-neon/source/math_modf.c
vendored
@ -1,71 +0,0 @@
|
||||
/*
|
||||
The MIT License (MIT)
|
||||
|
||||
Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com)
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/*
|
||||
Assumes the floating point value |x| < 2,147,483,648
|
||||
*/
|
||||
|
||||
#include "math_neon.h"
|
||||
|
||||
//Splits x into an integer part, stored through i, and the remaining
//fractional part, which is returned. The integer part is obtained with a
//C float-to-int cast.
float modf_c(float x, int *i)
{
    const int whole = (int)x;

    *i = whole;
    return x - (float)whole;
}
|
||||
|
||||
|
||||
//Hard-float-ABI variant of modf: stores the integer part of x through i
//and leaves the fractional part in d0[0].
//
//NOTE(review): the asm names d0 and r0 directly instead of using operands,
//and the NEON path has no return statement. It presumably relies on the
//hard-float calling convention passing x in s0 and i in r0 and returning
//floats in s0 - confirm before reusing this with other ABIs or with
//inlining enabled.
//
//When __MATH_NEON is not defined the call is forwarded to modf_c so the
//function always returns a value.
float modf_neon_hfp(float x, int *i)
{
#ifdef __MATH_NEON
    asm volatile (
    "vcvt.s32.f32   d1, d0          \n\t"   //d1 = (int) d0
    "vcvt.f32.s32   d2, d1          \n\t"   //d2 = (float) d1
    "vsub.f32       d0, d0, d2      \n\t"   //d0 = d0 - d2
    "vstr.i32       s2, [r0]        \n\t"   //[r0] = d1[0]
    ::: "d0", "d1", "d2", "memory"          //"memory": the asm stores through r0
    );
#else
    return modf_c(x, i);
#endif
}
|
||||
|
||||
|
||||
//Soft-float-ABI variant of modf: stores the integer part of x through i
//and returns the fractional part. When __MATH_NEON is not defined the call
//is forwarded to modf_c.
//
//x, i and the result are passed to the asm as ordinary core-register
//operands, so the routine does not depend on which registers the calling
//convention happens to use, and the compiler is told about the store
//through i ("memory") and about the result register.
float modf_neon_sfp(float x, int *i)
{
#ifdef __MATH_NEON
    float frac;

    asm volatile (
    "vdup.f32       d0, %1          \n\t"   //d0 = {x, x}
    "vcvt.s32.f32   d1, d0          \n\t"   //d1 = (int) d0
    "vcvt.f32.s32   d2, d1          \n\t"   //d2 = (float) d1
    "vsub.f32       d0, d0, d2      \n\t"   //d0 = d0 - d2
    "vstr.i32       s2, [%2]        \n\t"   //*i = d1[0]
    "vmov.f32       %0, s0          \n\t"   //frac = d0[0]
    : "=r"(frac)
    : "r"(x), "r"(i)
    : "d0", "d1", "d2", "memory"
    );
    return frac;
#else
    return modf_c(x, i);
#endif
}
|
439
deps/math-neon/source/math_neon.h
vendored
439
deps/math-neon/source/math_neon.h
vendored
@ -1,439 +0,0 @@
|
||||
/*
|
||||
The MIT License (MIT)
|
||||
|
||||
Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com)
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#ifndef __MATH_NEON_H__
|
||||
#define __MATH_NEON_H__
|
||||
|
||||
#if !defined(__i386__) && defined(__arm__)
//When __MATH_NEON is defined the NEON asm routines are compiled in.
//Otherwise the *_neon functions that provide a fallback forward to their
//equivalent *_c function.
#define __MATH_NEON

//Default floating point ABI: 0=softfp, 1=hardfp. Only affects which
//implementation the plain *_neon names select.
//You can access the hardfp versions directly via the *_neon_hfp suffix.
//You can access the softfp versions directly via the *_neon_sfp suffix.
//The value may be preset by the build (e.g. -D__MATH_FPABI=0); the default
//below only applies when it has not been defined yet.
#ifndef __MATH_FPABI
#define __MATH_FPABI 1
#endif

#endif
|
||||
|
||||
//ALIGN(A) expands to an alignment attribute of A bytes when the build
//defines GCC, and to nothing otherwise.
//NOTE(review): GCC is not a macro the compiler predefines (that would be
//__GNUC__), so the attribute is only active if the build passes -DGCC -
//confirm whether that is intended.
#ifdef GCC
#define ALIGN(A) __attribute__ ((aligned (A)))
#else
#define ALIGN(A)
#endif
|
||||
|
||||
//Math constants. Each one is only defined when it is not already
//available, so the header works both with C libraries whose <math.h>
//provides the M_* macros and with those (or strict language modes) that
//do not, independent of the library's private include-guard name.
#ifndef M_PI
#define M_PI		3.14159265358979323846	/* pi */
#endif
#ifndef M_PI_2
#define M_PI_2		1.57079632679489661923	/* pi/2 */
#endif
#ifndef M_PI_4
#define M_PI_4		0.78539816339744830962	/* pi/4 */
#endif
#ifndef M_E
#define M_E		2.7182818284590452354	/* e */
#endif
#ifndef M_LOG2E
#define M_LOG2E		1.4426950408889634074	/* log_2 e */
#endif
#ifndef M_LOG10E
#define M_LOG10E	0.43429448190325182765	/* log_10 e */
#endif
#ifndef M_LN2
#define M_LN2		0.69314718055994530942	/* log_e 2 */
#endif
#ifndef M_LN10
#define M_LN10		2.30258509299404568402	/* log_e 10 */
#endif
#ifndef M_1_PI
#define M_1_PI		0.31830988618379067154	/* 1/pi */
#endif
#ifndef M_2_PI
#define M_2_PI		0.63661977236758134308	/* 2/pi */
#endif
#ifndef M_2_SQRTPI
#define M_2_SQRTPI	1.12837916709551257390	/* 2/sqrt(pi) */
#endif
#ifndef M_SQRT2
#define M_SQRT2		1.41421356237309504880	/* sqrt(2) */
#endif
#ifndef M_SQRT1_2
#define M_SQRT1_2	0.70710678118654752440	/* 1/sqrt(2) */
#endif
|
||||
|
||||
//ABI dispatch for the plain *_neon names: with __MATH_FPABI == 1 each
//alias below resolves to the matching *_neon_hfp routine, with any other
//value to the matching *_neon_sfp routine.
#if __MATH_FPABI == 1
#define __MATH_NEON_ABI(fn)	fn##_neon_hfp
#else
#define __MATH_NEON_ABI(fn)	fn##_neon_sfp
#endif

#define sinf_neon		__MATH_NEON_ABI(sinf)
#define cosf_neon		__MATH_NEON_ABI(cosf)
#define sincosf_neon	__MATH_NEON_ABI(sincosf)
#define tanf_neon		__MATH_NEON_ABI(tanf)
#define atanf_neon		__MATH_NEON_ABI(atanf)
#define atan2f_neon		__MATH_NEON_ABI(atan2f)
#define asinf_neon		__MATH_NEON_ABI(asinf)
#define acosf_neon		__MATH_NEON_ABI(acosf)
#define sinhf_neon		__MATH_NEON_ABI(sinhf)
#define coshf_neon		__MATH_NEON_ABI(coshf)
#define tanhf_neon		__MATH_NEON_ABI(tanhf)
#define expf_neon		__MATH_NEON_ABI(expf)
#define logf_neon		__MATH_NEON_ABI(logf)
#define log10f_neon		__MATH_NEON_ABI(log10f)
#define powf_neon		__MATH_NEON_ABI(powf)
#define floorf_neon		__MATH_NEON_ABI(floorf)
#define ceilf_neon		__MATH_NEON_ABI(ceilf)
#define fabsf_neon		__MATH_NEON_ABI(fabsf)
#define ldexpf_neon		__MATH_NEON_ABI(ldexpf)
#define frexpf_neon		__MATH_NEON_ABI(frexpf)
#define fmodf_neon		__MATH_NEON_ABI(fmodf)
#define modf_neon		__MATH_NEON_ABI(modf)
#define sqrtf_neon		__MATH_NEON_ABI(sqrtf)
#define invsqrtf_neon	__MATH_NEON_ABI(invsqrtf)

#define dot2_neon		__MATH_NEON_ABI(dot2)
#define dot3_neon		__MATH_NEON_ABI(dot3)
#define dot4_neon		__MATH_NEON_ABI(dot4)
|
||||
|
||||
/*
|
||||
function: enable_runfast
|
||||
this function enables the floating point runfast mode on the
|
||||
ARM Cortex A8.
|
||||
*/
|
||||
void enable_runfast();
|
||||
|
||||
|
||||
float dot2_c(float v0[2], float v1[2]);
|
||||
float dot2_neon(float v0[2], float v1[2]);
|
||||
float dot3_c(float v0[3], float v1[3]);
|
||||
float dot3_neon(float v0[3], float v1[3]);
|
||||
float dot4_c(float v0[4], float v1[4]);
|
||||
float dot4_neon(float v0[4], float v1[4]);
|
||||
|
||||
void cross3_c(float v0[3], float v1[3], float d[3]);
|
||||
void cross3_neon(float v0[3], float v1[3], float d[3]);
|
||||
|
||||
void normalize2_c(float v[2], float d[2]);
|
||||
void normalize2_neon(float v[2], float d[2]);
|
||||
void normalize3_c(float v[3], float d[3]);
|
||||
void normalize3_neon(float v[3], float d[3]);
|
||||
void normalize4_c(float v[4], float d[4]);
|
||||
void normalize4_neon(float v[4], float d[4]);
|
||||
|
||||
/*
|
||||
function: matmul2
|
||||
arguments: m0 2x2 matrix, m1 2x2 matrix
|
||||
return: d 2x2 matrix
|
||||
expression: d = m0 * m1
|
||||
*/
|
||||
void matmul2_c(float m0[4], float m1[4], float d[4]);
|
||||
void matmul2_neon(float m0[4], float m1[4], float d[4]);
|
||||
|
||||
/*
|
||||
function: matmul3
|
||||
arguments: m0 3x3 matrix, m1 3x3 matrix
|
||||
return: d 3x3 matrix
|
||||
expression: d = m0 * m1
|
||||
*/
|
||||
void matmul3_c(float m0[9], float m1[9], float d[9]);
|
||||
void matmul3_neon(float m0[9], float m1[9], float d[9]);
|
||||
|
||||
/*
|
||||
function: matmul4
|
||||
arguments: m0 4x4 matrix, m1 4x4 matrix
|
||||
return: d 4x4 matrix
|
||||
expression: d = m0 * m1
|
||||
*/
|
||||
void matmul4_c(float m0[16], float m1[16], float d[16]);
|
||||
void matmul4_neon(float m0[16], float m1[16], float d[16]);
|
||||
|
||||
/*
|
||||
function: matvec2
|
||||
arguments: m 2x2 matrix, v 2 element vector
|
||||
return: d 2 element vector
|
||||
expression: d = m * v
|
||||
*/
|
||||
void matvec2_c(float m[4], float v[2], float d[2]);
|
||||
void matvec2_neon(float m[4], float v[2], float d[2]);
|
||||
|
||||
/*
|
||||
function: matvec3
|
||||
arguments: m 3x3 matrix, v 3 element vector
|
||||
return: d 3 element vector
|
||||
expression: d = m * v
|
||||
*/
|
||||
void matvec3_c(float m[9], float v[3], float d[3]);
|
||||
void matvec3_neon(float m[9], float v[3], float d[3]);
|
||||
|
||||
/*
|
||||
function: matvec4
|
||||
arguments: m 4x4 matrix, v 4 element vector
|
||||
return: d 4 element vector
|
||||
expression: d = m * v
|
||||
*/
|
||||
void matvec4_c(float m[16], float v[4], float d[4]);
|
||||
void matvec4_neon(float m[16], float v[4], float d[4]);
|
||||
|
||||
/*
|
||||
function: sinf
|
||||
arguments: x radians
|
||||
return: the sine function evaluated at x radians.
|
||||
expression: r = sin(x)
|
||||
*/
|
||||
float sinf_c(float x);
|
||||
float sinf_neon_hfp(float x);
|
||||
float sinf_neon_sfp(float x);
|
||||
|
||||
/*
|
||||
function: cosf
|
||||
arguments: x radians
|
||||
return: the cosine function evaluated at x radians.
|
||||
expression: r = cos(x)
|
||||
notes: computed using cos(x) = sin(x + pi/2)
|
||||
*/
|
||||
float cosf_c(float x);
|
||||
float cosf_neon_hfp(float x);
|
||||
float cosf_neon_sfp(float x);
|
||||
|
||||
/*
|
||||
function: sincosf
|
||||
arguments: x radians, r[2] result array.
|
||||
return: both the sine and the cosine evaluated at x radians.
|
||||
expression: r = {sin(x), cos(x)}
|
||||
notes: faster than evaluating separately.
|
||||
*/
|
||||
void sincosf_c(float x, float r[2]);
|
||||
void sincosf_neon_hfp(float x, float r[2]);
|
||||
void sincosf_neon_sfp(float x, float r[2]);
|
||||
|
||||
/*
|
||||
function: sinfv
|
||||
return: the sine function evaluated at x[i] radians
|
||||
expression: r[i] = sin(x[i])
|
||||
notes: faster than evaluating individually.
|
||||
r and x can be the same memory location.
|
||||
*/
|
||||
void sinfv_c(float *x, int n, float *r);
|
||||
void sinfv_neon(float *x, int n, float *r);
|
||||
|
||||
/*
|
||||
function: tanf
|
||||
return: the tangent evaluated at x radians.
|
||||
expression: r = tan(x)
|
||||
notes: computed using tan(x) = sin(x) / cos(x)
|
||||
*/
|
||||
float tanf_c(float x);
|
||||
float tanf_neon_hfp(float x);
|
||||
float tanf_neon_sfp(float x);
|
||||
|
||||
/*
|
||||
function: atanf
|
||||
return: the arctangent evaluated at x.
|
||||
expression: r = atan(x)
|
||||
*/
|
||||
float atanf_c(float x);
|
||||
float atanf_neon_hfp(float x);
|
||||
float atanf_neon_sfp(float x);
|
||||
|
||||
/*
|
||||
function: atan2f
|
||||
return: the arctangent evaluated at y / x.
|
||||
expression: r = atan2(y, x)
|
||||
*/
|
||||
float atan2f_c(float y, float x);
|
||||
float atan2f_neon_hfp(float y, float x);
|
||||
float atan2f_neon_sfp(float y, float x);
|
||||
|
||||
/*
|
||||
function: asinf
|
||||
return: the arcsine evaluated at x.
|
||||
expression: r = asin(x)
|
||||
*/
|
||||
float asinf_c(float x);
|
||||
float asinf_neon_hfp(float x);
|
||||
float asinf_neon_sfp(float x);
|
||||
|
||||
/*
|
||||
function: acosf
|
||||
return: the arccosine evaluated at x.
|
||||
expression: r = acos(x)
|
||||
*/
|
||||
float acosf_c(float x);
|
||||
float acosf_neon_hfp(float x);
|
||||
float acosf_neon_sfp(float x);
|
||||
|
||||
/*
|
||||
function: sinhf
|
||||
return: the hyperbolic sine evaluated at x.
|
||||
expression: r = sinh(x)
|
||||
*/
|
||||
float sinhf_c(float x);
|
||||
float sinhf_neon_hfp(float x);
|
||||
float sinhf_neon_sfp(float x);
|
||||
|
||||
/*
|
||||
function: coshf
|
||||
return: the hyperbolic cosine evaluated at x.
|
||||
expression: r = cosh(x)
|
||||
*/
|
||||
float coshf_c(float x);
|
||||
float coshf_neon_hfp(float x);
|
||||
float coshf_neon_sfp(float x);
|
||||
|
||||
/*
|
||||
function: tanhf
|
||||
return: the hyperbolic tangent evaluated at x.
|
||||
expression: r = tanh(x)
|
||||
*/
|
||||
float tanhf_c(float x);
|
||||
float tanhf_neon_hfp(float x);
|
||||
float tanhf_neon_sfp(float x);
|
||||
|
||||
/*
|
||||
function: expf
|
||||
return: the natural exponential evaluated at x.
|
||||
expression: r = e ** x
|
||||
*/
|
||||
float expf_c(float x);
|
||||
float expf_neon_hfp(float x);
|
||||
float expf_neon_sfp(float x);
|
||||
|
||||
/*
|
||||
function: logf
|
||||
return: the value of the natural logarithm of x.
|
||||
expression: r = ln(x)
|
||||
notes: assumes x > 0
|
||||
*/
|
||||
float logf_c(float x);
|
||||
float logf_neon_hfp(float x);
|
||||
float logf_neon_sfp(float x);
|
||||
|
||||
/*
|
||||
function: log10f
|
||||
return: the value of the power 10 logarithm of x.
|
||||
expression: r = log10(x)
|
||||
notes: assumes x > 0
|
||||
*/
|
||||
float log10f_c(float x);
|
||||
float log10f_neon_hfp(float x);
|
||||
float log10f_neon_sfp(float x);
|
||||
|
||||
/*
|
||||
function: powf
|
||||
return: x raised to the power of n, x ** n.
|
||||
expression: r = x ** y
|
||||
notes: computed using e ** (y * ln(x))
|
||||
*/
|
||||
float powf_c(float x, float n);
|
||||
float powf_neon_sfp(float x, float n);
|
||||
float powf_neon_hfp(float x, float n);
|
||||
|
||||
/*
|
||||
function: floorf
|
||||
return: x rounded down (towards negative infinity) to its nearest
|
||||
integer value.
|
||||
notes: assumes |x| < 2 ** 31
|
||||
*/
|
||||
float floorf_c(float x);
|
||||
float floorf_neon_sfp(float x);
|
||||
float floorf_neon_hfp(float x);
|
||||
|
||||
/*
|
||||
function: ceilf
|
||||
return: x rounded up (towards positive infinity) to its nearest
|
||||
integer value.
|
||||
notes: assumes |x| < 2 ** 31
|
||||
*/
|
||||
float ceilf_c(float x);
|
||||
float ceilf_neon_hfp(float x);
|
||||
float ceilf_neon_sfp(float x);
|
||||
|
||||
/*
|
||||
function: fabsf
|
||||
return: absolute value of x
|
||||
notes: assumes |x| < 2 ** 31
|
||||
*/
|
||||
float fabsf_c(float x);
|
||||
float fabsf_neon_hfp(float x);
|
||||
float fabsf_neon_sfp(float x);
|
||||
|
||||
/*
|
||||
function: ldexpf
|
||||
return: the value of m multiplied by 2 to the power of e.
|
||||
expression: r = m * (2 ** e)
|
||||
*/
|
||||
float ldexpf_c(float m, int e);
|
||||
float ldexpf_neon_hfp(float m, int e);
|
||||
float ldexpf_neon_sfp(float m, int e);
|
||||
|
||||
/*
|
||||
function: frexpf
|
||||
return: the exponent and mantissa of x
|
||||
*/
|
||||
float frexpf_c(float x, int *e);
|
||||
float frexpf_neon_hfp(float x, int *e);
|
||||
float frexpf_neon_sfp(float x, int *e);
|
||||
|
||||
/*
|
||||
function: fmodf
|
||||
return: the remainder of x divided by y, x % y
|
||||
expression: r = x - floor(x / y) * y;
|
||||
notes: assumes that |x / y| < 2 ** 31
|
||||
*/
|
||||
float fmodf_c(float x, float y);
|
||||
float fmodf_neon_hfp(float x, float y);
|
||||
float fmodf_neon_sfp(float x, float y);
|
||||
|
||||
/*
|
||||
function: modf
|
||||
return: breaks x into the integer (i) and fractional part (return)
|
||||
notes: assumes that |x| < 2 ** 31
|
||||
*/
|
||||
float modf_c(float x, int *i);
|
||||
float modf_neon_hfp(float x, int *i);
|
||||
float modf_neon_sfp(float x, int *i);
|
||||
|
||||
/*
|
||||
function: sqrtf
|
||||
return: (x^0.5)
|
||||
notes:
|
||||
*/
|
||||
float sqrtf_c(float x);
|
||||
float sqrtf_neon_hfp(float x);
|
||||
float sqrtf_neon_sfp(float x);
|
||||
|
||||
|
||||
/*
|
||||
function: invsqrtf
|
||||
return: 1.0f / (x^0.5)
|
||||
notes:
|
||||
*/
|
||||
float invsqrtf_c(float x);
|
||||
float invsqrtf_neon_hfp(float x);
|
||||
float invsqrtf_neon_sfp(float x);
|
||||
|
||||
#endif
|
182
deps/math-neon/source/math_powf.c
vendored
182
deps/math-neon/source/math_powf.c
vendored
@ -1,182 +0,0 @@
|
||||
/*
|
||||
The MIT License (MIT)
|
||||
|
||||
Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com)
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/*
|
||||
Based on x ^ n = exp(n * log(x))
|
||||
|
||||
Test func : powf(x, n)
|
||||
Test Range: (1,1) < (x, n) < (10, 10)
|
||||
Peak Error: ~0.0010%
|
||||
RMS Error: ~0.0002%
|
||||
*/
|
||||
|
||||
#include "math.h"
|
||||
#include "math_neon.h"
|
||||
|
||||
//Range constants used by powf: {1 / ln(2), ln(2)}.
const float __powf_rng[2] = {
    1.442695041f,
    0.693147180f
};

//Polynomial coefficients used by powf. Entries 0..7 belong to the
//logarithm polynomial, entries 8..15 to the exponential polynomial. Each
//group is stored in the order {p0, p4, p2, p6, p1, p5, p3, p7}.
const float __powf_lut[16] = {
    -2.295614848256274,         //p0    log
    -2.470711633419806,         //p4
    -5.686926051100417,         //p2
    -0.165253547131978,         //p6
    +5.175912446351073,         //p1
    +0.844006986174912,         //p5
    +4.584458825456749,         //p3
    +0.014127821926000,         //p7
    0.9999999916728642,         //p0    exp
    0.04165989275009526,        //p4
    0.5000006143673624,         //p2
    0.0014122663401803872,      //p6
    1.000000059694879,          //p1
    0.008336936973260111,       //p5
    0.16666570253074878,        //p3
    0.00019578093328483123      //p7
};

//Evaluates the degree-7 polynomial whose coefficients are stored at p in
//the {p0, p4, p2, p6, p1, p5, p3, p7} order, at the point t, pairing the
//terms as in Estrin's scheme.
static float powf_poly7(const float *p, float t)
{
    float lo  = (p[4] * t) + (p[0]);    //p0 + p1*t
    float mid = (p[6] * t) + (p[2]);    //p2 + p3*t
    float hi  = (p[5] * t) + (p[1]);    //p4 + p5*t
    float top = (p[7] * t) + (p[3]);    //p6 + p7*t
    float t2  = t * t;

    lo = lo + mid * t2;                 //p0 .. p3 terms
    hi = hi + top * t2;                 //p4 .. p7 terms, still missing t^4
    t2 = t2 * t2;
    return lo + hi * t2;
}

//Approximates x raised to the power n as exp(n * log(x)).
//The float's exponent field is split off, the logarithm polynomial is
//applied to the remaining value, the result is scaled by n and then run
//through the exponential polynomial after range reduction by ln(2).
float powf_c(float x, float n)
{
    union {
        float f;
        int i;
    } bits;
    float y;
    int e;

    //extract exponent
    bits.f = x;
    e = (bits.i >> 23) - 127;
    bits.i = bits.i - (e << 23);

    //log polynomial, then add the exponent back as e * ln(2)
    y = powf_poly7(&__powf_lut[0], bits.f);
    y = y + ((float) e) * __powf_rng[1];

    y = y * n;

    //range reduction: y = e * ln(2) + remainder
    e = (int) (y * __powf_rng[0]);
    y = y - ((float) e) * __powf_rng[1];

    //exp polynomial on the remainder
    bits.f = powf_poly7(&__powf_lut[8], y);

    //multiply by 2 ^ e through the exponent field
    bits.i = bits.i + (e << 23);

    return bits.f;
}
|
||||
|
||||
//Hard-float-ABI NEON variant of powf, following the same steps as powf_c:
//exponent extraction, log polynomial, scaling by n, range reduction, exp
//polynomial and a final exponent adjustment. The result is left in d0.
//
//NOTE(review): the asm reads x from d0[0] and n from d0[1] and has no
//return statement on the NEON path. It presumably relies on the hard-float
//calling convention (x in s0, n in s1, float result in s0) - confirm
//before reusing this with other ABIs or with inlining enabled.
//
//The lookup-table pointer is advanced by the first table load, so it is
//passed through a local read-write operand. d16 is written by the asm and
//is listed as a clobber. When __MATH_NEON is not defined the call is
//forwarded to powf_c so the function always returns a value.
float powf_neon_hfp(float x, float n)
{
#ifdef __MATH_NEON
    const float *lut = __powf_lut;

    asm volatile (

    "vdup.f32       d16, d0[1]              \n\t"   //d16 = {n, n}
    "vdup.f32       d0, d0[0]               \n\t"   //d0 = {x, x}

    //extract exponent
    "vmov.i32       d2, #127                \n\t"   //d2 = 127
    "vshr.u32       d6, d0, #23             \n\t"   //d6 = d0 >> 23
    "vsub.i32       d6, d6, d2              \n\t"   //d6 = d6 - d2
    "vshl.u32       d1, d6, #23             \n\t"   //d1 = d6 << 23
    "vsub.i32       d0, d0, d1              \n\t"   //d0 = d0 - d1

    //log polynomial
    "vmul.f32       d1, d0, d0              \n\t"   //d1 = d0 * d0
    "vld1.32        {d2, d3, d4, d5}, [%[lut]]! \n\t"   //q1 = {p0, p4, p2, p6}, q2 = {p1, p5, p3, p7}; lut += 8 floats
    "vmla.f32       q1, q2, d0[0]           \n\t"   //q1 = q1 + q2 * d0[0]
    "vmla.f32       d2, d3, d1[0]           \n\t"   //d2 = d2 + d3 * d1[0]
    "vmul.f32       d1, d1, d1              \n\t"   //d1 = d1 * d1
    "vmla.f32       d2, d1, d2[1]           \n\t"   //d2 = d2 + d1 * d2[1]

    //add exponent
    "vld1.32        d7, [%[rng]]            \n\t"   //d7 = {__powf_rng[0], __powf_rng[1]}
    "vcvt.f32.s32   d6, d6                  \n\t"   //d6 = (float) d6
    "vmla.f32       d2, d6, d7[1]           \n\t"   //d2 = d2 + d6 * d7[1]

    "vdup.f32       d0, d2[0]               \n\t"   //d0 = {d2[0], d2[0]}
    "vmul.f32       d0, d0, d16             \n\t"   //d0 = d0 * d16

    //range reduction
    "vmul.f32       d6, d0, d7[0]           \n\t"   //d6 = d0 * d7[0]
    "vcvt.u32.f32   d6, d6                  \n\t"   //d6 = (unsigned int) d6
    "vcvt.f32.u32   d1, d6                  \n\t"   //d1 = (float) d6
    "vmls.f32       d0, d1, d7[1]           \n\t"   //d0 = d0 - d1 * d7[1]

    //exp polynomial
    "vmul.f32       d1, d0, d0              \n\t"   //d1 = d0 * d0
    "vld1.32        {d2, d3, d4, d5}, [%[lut]]  \n\t"   //q1 = {p0, p4, p2, p6}, q2 = {p1, p5, p3, p7}
    "vmla.f32       q1, q2, d0[0]           \n\t"   //q1 = q1 + q2 * d0[0]
    "vmla.f32       d2, d3, d1[0]           \n\t"   //d2 = d2 + d3 * d1[0]
    "vmul.f32       d1, d1, d1              \n\t"   //d1 = d1 * d1
    "vmla.f32       d2, d1, d2[1]           \n\t"   //d2 = d2 + d1 * d2[1]

    //multiply by 2 ^ m
    "vshl.i32       d6, d6, #23             \n\t"   //d6 = d6 << 23
    "vadd.i32       d0, d2, d6              \n\t"   //d0 = d2 + d6

    : [lut] "+r"(lut)
    : [rng] "r"(__powf_rng)
    : "d0", "d1", "d2", "d3", "d4", "d5", "d6", "d7", "d16"
    );
#else
    return powf_c(x, n);
#endif
}
|
||||
|
||||
//Soft-float-ABI wrapper for powf. On the NEON path it copies r0 and r1
//into s0 and s1, calls powf_neon_hfp and copies s0 back into r0; without
//__MATH_NEON it returns powf_c(x, n).
//
//NOTE(review): the three asm statements name r0, r1, s0 and s1 directly
//and declare no operands or clobbers, and the NEON path has no return
//statement. This presumably depends on x and n still being in r0/r1 on
//entry and on nothing disturbing s0/s1/r0 around the call - confirm
//against the compiler output before relying on it.
float powf_neon_sfp(float x, float n)
{
#ifdef __MATH_NEON
    asm volatile ("vmov.f32 s0, r0      \n\t");
    asm volatile ("vmov.f32 s1, r1      \n\t");
    powf_neon_hfp(x, n);
    asm volatile ("vmov.f32 r0, s0      \n\t");
#else
    return powf_c(x, n);
#endif
}
|
42
deps/math-neon/source/math_runfast.c
vendored
42
deps/math-neon/source/math_runfast.c
vendored
@ -1,42 +0,0 @@
|
||||
/*
|
||||
The MIT License (MIT)
|
||||
|
||||
Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com)
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
|
||||
/*
 * Reprogram the VFP status/control register (FPSCR) on ARM builds:
 * every bit outside 0x04086060 is cleared, then bits 24 and 25
 * (0x03000000 - FZ and DN in the ARM FPSCR layout) are set.
 * On non-ARM builds this function does nothing.
 */
void
enable_runfast()
{
#ifdef __arm__
	static const unsigned int keep_mask = 0x04086060;
	static const unsigned int set_bits = 0x03000000;
	int fpscr;
	asm volatile (
	"fmrx %0, fpscr \n\t"	//fpscr = FPSCR
	"and %0, %0, %1 \n\t"	//fpscr = fpscr & keep_mask
	"orr %0, %0, %2 \n\t"	//fpscr = fpscr | set_bits
	"fmxr fpscr, %0 \n\t"	//FPSCR = fpscr
	// %0 is written before %1/%2 are consumed, so it must be earlyclobber
	// ("=&r"); otherwise it may share a register with one of the inputs.
	: "=&r"(fpscr)
	: "r"(keep_mask), "r"(set_bits)
	);
#endif
}
|
163
deps/math-neon/source/math_sincosf.c
vendored
163
deps/math-neon/source/math_sincosf.c
vendored
@ -1,163 +0,0 @@
|
||||
/*
|
||||
The MIT License (MIT)
|
||||
|
||||
Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com)
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "math.h"
|
||||
#include "math_neon.h"
|
||||
|
||||
const float __sincosf_rng[2] = {
|
||||
2.0 / M_PI,
|
||||
M_PI / 2.0
|
||||
};
|
||||
|
||||
const float __sincosf_lut[8] = {
|
||||
-0.00018365f, //p7
|
||||
-0.00018365f, //p7
|
||||
+0.00830636f, //p5
|
||||
+0.00830636f, //p5
|
||||
-0.16664831f, //p3
|
||||
-0.16664831f, //p3
|
||||
+0.99999661f, //p1
|
||||
+0.99999661f, //p1
|
||||
};
|
||||
|
||||
void sincosf_c( float x, float r[2])
|
||||
{
|
||||
union {
|
||||
float f;
|
||||
int i;
|
||||
} ax, bx;
|
||||
|
||||
float xx, yy;
|
||||
int m, n, o, p;
|
||||
|
||||
float y = x + __sincosf_rng[1];
|
||||
|
||||
ax.f = fabsf(x);
|
||||
bx.f = fabsf(y);
|
||||
|
||||
//Range Reduction:
|
||||
m = (int) (ax.f * __sincosf_rng[0]);
|
||||
o = (int) (bx.f * __sincosf_rng[0]);
|
||||
ax.f = ax.f - (((float)m) * __sincosf_rng[1]);
|
||||
bx.f = bx.f - (((float)o) * __sincosf_rng[1]);
|
||||
|
||||
//Test Quadrant
|
||||
n = m & 1;
|
||||
p = o & 1;
|
||||
ax.f = ax.f - n * __sincosf_rng[1];
|
||||
bx.f = bx.f - p * __sincosf_rng[1];
|
||||
m = m >> 1;
|
||||
o = o >> 1;
|
||||
n = n ^ m;
|
||||
p = p ^ o;
|
||||
m = (x < 0.0);
|
||||
o = (y < 0.0);
|
||||
n = n ^ m;
|
||||
p = p ^ o;
|
||||
n = n << 31;
|
||||
p = p << 31;
|
||||
ax.i = ax.i ^ n;
|
||||
bx.i = bx.i ^ p;
|
||||
|
||||
//Taylor Polynomial
|
||||
xx = ax.f * ax.f;
|
||||
yy = bx.f * bx.f;
|
||||
r[0] = __sincosf_lut[0];
|
||||
r[1] = __sincosf_lut[1];
|
||||
r[0] = r[0] * xx + __sincosf_lut[2];
|
||||
r[1] = r[1] * yy + __sincosf_lut[3];
|
||||
r[0] = r[0] * xx + __sincosf_lut[4];
|
||||
r[1] = r[1] * yy + __sincosf_lut[5];
|
||||
r[0] = r[0] * xx + __sincosf_lut[6];
|
||||
r[1] = r[1] * yy + __sincosf_lut[7];
|
||||
r[0] = r[0] * ax.f;
|
||||
r[1] = r[1] * bx.f;
|
||||
|
||||
}
|
||||
|
||||
/*
 * NEON sine/cosine pair: stores two floats to r[0] and r[1], mirroring
 * sincosf_c().
 *
 * HACK: Assumes for softfp that r1 = x, and for hardfp that s0 = x; the asm
 * reads x straight out of d0[0] rather than through an operand.
 * Without __MATH_NEON this simply calls sincosf_c().
 */
void sincosf_neon_hfp(float x, float r[2])
{
#ifdef __MATH_NEON
	// The table pointer is advanced by the vldm writebacks, so it has to be a
	// read-write operand on a local copy; an input-only operand must never be
	// modified by the asm.
	const float *lut = __sincosf_lut;

	asm volatile (
	//{x, y} = {x, x + pi/2}
	"vdup.f32 d1, d0[0] \n\t"		//d1 = {x, x}
	"vld1.32 d3, [%[rng]] \n\t"		//d3 = {invrange, range}
	"vadd.f32 d0, d1, d3 \n\t"		//d0 = d1 + d3
	"vmov.f32 s0, s2 \n\t"			//d0[0] = d1[0]
	"vabs.f32 d1, d0 \n\t"			//d1 = {abs(x), abs(y)}

	//Range Reduction:
	"vmul.f32 d2, d1, d3[0] \n\t"		//d2 = d1 * d3[0]
	"vcvt.u32.f32 d2, d2 \n\t"		//d2 = (int) d2
	"vcvt.f32.u32 d4, d2 \n\t"		//d4 = (float) d2
	"vmls.f32 d1, d4, d3[1] \n\t"		//d1 = d1 - d4 * d3[1]

	//Checking Quadrant:
	//ax = ax - (k&1) * M_PI_2
	"vmov.i32 d4, #1 \n\t"			//d4 = 1
	"vand.i32 d4, d4, d2 \n\t"		//d4 = d4 & d2
	"vcvt.f32.u32 d5, d4 \n\t"		//d5 = (float) d4
	"vmls.f32 d1, d5, d3[1] \n\t"		//d1 = d1 - d5 * d3[1]

	//ax = ax ^ ((k & 1) ^ (k >> 1) ^ (x < 0) << 31)
	"vshr.u32 d3, d2, #1 \n\t"		//d3 = d2 >> 1
	"veor.i32 d4, d4, d3 \n\t"		//d4 = d4 ^ d3
	"vclt.f32 d3, d0, #0 \n\t"		//d3 = (d0 < 0.0)
	"veor.i32 d4, d4, d3 \n\t"		//d4 = d4 ^ d3
	"vshl.i32 d4, d4, #31 \n\t"		//d4 = d4 << 31
	"veor.i32 d0, d1, d4 \n\t"		//d0 = d1 ^ d4

	//polynomial:
	"vldm %[lut]!, {d2, d3} \n\t"		//d2 = {p7, p7}, d3 = {p5, p5}, lut advanced
	"vmul.f32 d1, d0, d0 \n\t"		//d1 = d0 * d0 = {x^2, y^2}
	"vldm %[lut]!, {d4} \n\t"		//d4 = {p3, p3}, lut advanced
	"vmla.f32 d3, d2, d1 \n\t"		//d3 = d3 + d2 * d1;
	"vldm %[lut]!, {d5} \n\t"		//d5 = {p1, p1}, lut advanced
	"vmla.f32 d4, d3, d1 \n\t"		//d4 = d4 + d3 * d1;
	"vmla.f32 d5, d4, d1 \n\t"		//d5 = d5 + d4 * d1;
	"vmul.f32 d5, d5, d0 \n\t"		//d5 = d5 * d0;

	"vstm.f32 %[out], {d5} \n\t"		//r[0] = d5[0], r[1]=d5[1];

	: [lut] "+r"(lut)
	: [out] "r"(r), [rng] "r"(__sincosf_rng)
	// "memory": the asm stores through r, which the compiler cannot see.
	: "d0", "d1", "d2", "d3", "d4", "d5", "memory"
	);
#else
	sincosf_c(x, r);
#endif
}
|
||||
|
||||
/*
 * sincosf entry point for the soft-float calling convention.
 *
 * With __MATH_NEON defined it assumes x arrives in r0, broadcasts it into d0
 * and calls sincosf_neon_hfp(), which stores the two results through r.
 * The function returns void, so nothing is moved back into r0 afterwards.
 * Without __MATH_NEON the portable sincosf_c() is used.
 */
void sincosf_neon_sfp(float x, float r[2])
{
#ifdef __MATH_NEON
	asm volatile ("vdup.f32 d0, r0 \n\t");
	sincosf_neon_hfp(x, r);
#else
	sincosf_c(x, r);
#endif
}
|
||||
|
128
deps/math-neon/source/math_sinf.c
vendored
128
deps/math-neon/source/math_sinf.c
vendored
@ -1,128 +0,0 @@
|
||||
/*
|
||||
The MIT License (MIT)
|
||||
|
||||
Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com)
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include <math.h>
|
||||
#include "math_neon.h"
|
||||
|
||||
static const float __sinf_rng[2] = {
|
||||
2.0 / M_PI,
|
||||
M_PI / 2.0
|
||||
} ALIGN(16);
|
||||
|
||||
static const float __sinf_lut[4] = {
|
||||
-0.00018365f, //p7
|
||||
-0.16664831f, //p3
|
||||
+0.00830636f, //p5
|
||||
+0.99999661f, //p1
|
||||
} ALIGN(16);
|
||||
|
||||
float sinf_c(float x)
|
||||
{
|
||||
union {
|
||||
float f;
|
||||
int i;
|
||||
} ax;
|
||||
|
||||
float r, a, b, xx;
|
||||
int m, n;
|
||||
|
||||
ax.f = fabsf(x);
|
||||
|
||||
//Range Reduction:
|
||||
m = (int) (ax.f * __sinf_rng[0]);
|
||||
ax.f = ax.f - (((float)m) * __sinf_rng[1]);
|
||||
|
||||
//Test Quadrant
|
||||
n = m & 1;
|
||||
ax.f = ax.f - n * __sinf_rng[1];
|
||||
m = m >> 1;
|
||||
n = n ^ m;
|
||||
m = (x < 0.0);
|
||||
n = n ^ m;
|
||||
n = n << 31;
|
||||
ax.i = ax.i ^ n;
|
||||
|
||||
//Taylor Polynomial (Estrins)
|
||||
xx = ax.f * ax.f;
|
||||
a = (__sinf_lut[0] * ax.f) * xx + (__sinf_lut[2] * ax.f);
|
||||
b = (__sinf_lut[1] * ax.f) * xx + (__sinf_lut[3] * ax.f);
|
||||
xx = xx * xx;
|
||||
r = b + a * xx;
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
/*
 * NEON sine approximation.
 *
 * HACK: with __MATH_NEON defined the asm reads x from d0[0] and leaves the
 * result in s0 without going through asm operands or a C return statement,
 * so it assumes the float argument and result both live in s0.
 * Without __MATH_NEON it falls back to sinf_c(), so the function never
 * runs off its end without a value.
 */
float sinf_neon_hfp(float x)
{
#ifdef __MATH_NEON
	asm volatile (

	"vld1.32 d3, [%0] \n\t"			//d3 = {invrange, range}
	"vdup.f32 d0, d0[0] \n\t"		//d0 = {x, x}
	"vabs.f32 d1, d0 \n\t"			//d1 = {ax, ax}

	"vmul.f32 d2, d1, d3[0] \n\t"		//d2 = d1 * d3[0]
	"vcvt.u32.f32 d2, d2 \n\t"		//d2 = (int) d2
	"vmov.i32 d5, #1 \n\t"			//d5 = 1
	"vcvt.f32.u32 d4, d2 \n\t"		//d4 = (float) d2
	"vshr.u32 d7, d2, #1 \n\t"		//d7 = d2 >> 1
	"vmls.f32 d1, d4, d3[1] \n\t"		//d1 = d1 - d4 * d3[1]

	"vand.i32 d5, d2, d5 \n\t"		//d5 = d2 & d5
	"vclt.f32 d18, d0, #0 \n\t"		//d18 = (d0 < 0.0)
	"vcvt.f32.u32 d6, d5 \n\t"		//d6 = (float) d5
	"vmls.f32 d1, d6, d3[1] \n\t"		//d1 = d1 - d6 * d3[1]
	"veor.i32 d5, d5, d7 \n\t"		//d5 = d5 ^ d7
	"vmul.f32 d2, d1, d1 \n\t"		//d2 = d1*d1 = {x^2, x^2}

	"vld1.32 {d16, d17}, [%1] \n\t"		//q8 = {p7, p3, p5, p1}
	"veor.i32 d5, d5, d18 \n\t"		//d5 = d5 ^ d18
	"vshl.i32 d5, d5, #31 \n\t"		//d5 = d5 << 31
	"veor.i32 d1, d1, d5 \n\t"		//d1 = d1 ^ d5

	"vmul.f32 d3, d2, d2 \n\t"		//d3 = d2*d2 = {x^4, x^4}
	"vmul.f32 q0, q8, d1[0] \n\t"		//q0 = q8 * d1[0] = {p7x, p3x, p5x, p1x}
	"vmla.f32 d1, d0, d2[0] \n\t"		//d1 = d1 + d0*d2 = {p5x + p7x^3, p1x + p3x^3}
	"vmla.f32 d1, d3, d1[0] \n\t"		//d1 = d1 + d3*d1[0] = {...., p1x + p3x^3 + p5x^5 + p7x^7}

	"vmov.f32 s0, s3 \n\t"			//s0 = s3
	:
	: "r"(__sinf_rng), "r"(__sinf_lut)
	: "q0", "q1", "q2", "q3", "q8", "q9"
	);
#else
	return sinf_c(x);
#endif
}
|
||||
|
||||
/*
 * sinf entry point for the soft-float calling convention.
 *
 * With __MATH_NEON defined this is a register-level hack: it assumes x
 * arrives in r0, broadcasts it into d0, calls sinf_neon_hfp(), and copies s0
 * back into r0 as the result (there is no C-level return on that path).
 * Without __MATH_NEON the portable sinf_c() is used.
 */
float sinf_neon_sfp(float x)
{
#ifdef __MATH_NEON
	asm volatile ("vdup.f32 d0, r0 \n\t");
	sinf_neon_hfp(x);
	asm volatile ("vmov.f32 r0, s0 \n\t");
#else
	return sinf_c(x);
#endif
}
|
||||
|
110
deps/math-neon/source/math_sinfv.c
vendored
110
deps/math-neon/source/math_sinfv.c
vendored
@ -1,110 +0,0 @@
|
||||
/*
|
||||
The MIT License (MIT)
|
||||
|
||||
Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com)
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "math.h"
|
||||
#include "math_neon.h"
|
||||
|
||||
const float __sinfv_rng[2] = {
|
||||
2.0 / M_PI,
|
||||
M_PI / 2.0,
|
||||
};
|
||||
|
||||
const float __sinfv_lut[4] = {
|
||||
-0.00018365f, //p7
|
||||
-0.16664831f, //p3
|
||||
+0.00830636f, //p5
|
||||
+0.99999661f, //p1
|
||||
};
|
||||
|
||||
void sinfv_c(float *x, int n, float *r)
|
||||
{
|
||||
union {
|
||||
float f;
|
||||
int i;
|
||||
} ax, bx;
|
||||
|
||||
float aa, ab, ba, bb, axx, bxx;
|
||||
int am, bm, an, bn;
|
||||
|
||||
if (n & 0x1) {
|
||||
*r++ = sinf_c(*x++);
|
||||
n--;
|
||||
}
|
||||
|
||||
float rng0 = __sinfv_rng[0];
|
||||
float rng1 = __sinfv_rng[1];
|
||||
|
||||
while(n > 0){
|
||||
|
||||
float x0 = *x++;
|
||||
float x1 = *x++;
|
||||
|
||||
ax.f = fabsf(x0);
|
||||
bx.f = fabsf(x1);
|
||||
|
||||
//Range Reduction:
|
||||
am = (int) (ax.f * rng0);
|
||||
bm = (int) (bx.f * rng0);
|
||||
|
||||
ax.f = ax.f - (((float)am) * rng1);
|
||||
bx.f = bx.f - (((float)bm) * rng1);
|
||||
|
||||
//Test Quadrant
|
||||
an = am & 1;
|
||||
bn = bm & 1;
|
||||
ax.f = ax.f - an * rng1;
|
||||
bx.f = bx.f - bn * rng1;
|
||||
am = (am & 2) >> 1;
|
||||
bm = (bm & 2) >> 1;
|
||||
ax.i = ax.i ^ ((an ^ am ^ (x0 < 0)) << 31);
|
||||
bx.i = bx.i ^ ((bn ^ bm ^ (x1 < 0)) << 31);
|
||||
|
||||
//Taylor Polynomial (Estrins)
|
||||
axx = ax.f * ax.f;
|
||||
bxx = bx.f * bx.f;
|
||||
aa = (__sinfv_lut[0] * ax.f) * axx + (__sinfv_lut[2] * ax.f);
|
||||
ba = (__sinfv_lut[0] * bx.f) * bxx + (__sinfv_lut[2] * bx.f);
|
||||
ab = (__sinfv_lut[1] * ax.f) * axx + (__sinfv_lut[3] * ax.f);
|
||||
bb = (__sinfv_lut[1] * bx.f) * bxx + (__sinfv_lut[3] * bx.f);
|
||||
axx = axx * axx;
|
||||
bxx = bxx * bxx;
|
||||
*r++ = ab + aa * axx;
|
||||
*r++ = bb + ba * bxx;
|
||||
n -= 2;
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
/*
 * Vector sine: writes an approximation of sin(x[i]) to r[i] for i in [0, n).
 *
 * No NEON implementation exists for this routine (the former __MATH_NEON
 * branch was an empty asm statement that left r untouched), so the portable
 * sinfv_c() is used in every configuration.
 */
void sinfv_neon(float *x, int n, float *r)
{
	sinfv_c(x, n, r);
}
|
120
deps/math-neon/source/math_sinhf.c
vendored
120
deps/math-neon/source/math_sinhf.c
vendored
@ -1,120 +0,0 @@
|
||||
/*
|
||||
The MIT License (MIT)
|
||||
|
||||
Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com)
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "math.h"
|
||||
#include "math_neon.h"
|
||||
|
||||
// Range-reduction constants used by sinhf_neon_hfp (the asm comments call
// them {invrange, range}).
const float __sinhf_rng[2] = { 1.442695041f, 0.693147180f };

// Polynomial coefficients p7 .. p0, highest order first. Each coefficient is
// stored twice in a row.
const float __sinhf_lut[16] = {
	0.00019578093328483123, 0.00019578093328483123,	//p7
	0.0014122663401803872, 0.0014122663401803872,	//p6
	0.008336936973260111, 0.008336936973260111,	//p5
	0.04165989275009526, 0.04165989275009526,	//p4
	0.16666570253074878, 0.16666570253074878,	//p3
	0.5000006143673624, 0.5000006143673624,		//p2
	1.000000059694879, 1.000000059694879,		//p1
	0.9999999916728642, 0.9999999916728642		//p0
};
|
||||
|
||||
|
||||
/*
 * Portable hyperbolic sine: (expf_c(x) - expf_c(-x)) / 2.
 */
float sinhf_c(float x)
{
	const float grow = expf_c(x);
	const float decay = expf_c(-x);
	float diff = grow - decay;

	return diff * 0.5f;
}
|
||||
|
||||
|
||||
/*
 * NEON hyperbolic sine: evaluates the exponential polynomial on the two
 * lanes {x, -x} at once, then returns half of their difference.
 *
 * HACK: with __MATH_NEON defined the asm reads x from d0[0] and leaves the
 * result in s0 without going through asm operands or a C return statement,
 * so it assumes the float argument and result both live in s0.
 * Without __MATH_NEON it falls back to sinhf_c().
 */
float sinhf_neon_hfp(float x)
{
#ifdef __MATH_NEON
	// The table pointer is post-incremented by every coefficient load, so it
	// has to be a read-write operand on a local copy; an input-only operand
	// must never be modified by the asm.
	const float *lut = __sinhf_lut;

	asm volatile (
	"vdup.f32 d0, d0[0] \n\t"		//d0 = {x, x}
	"fnegs s1, s1 \n\t"			//s1 = -s1

	//Range Reduction:
	"vld1.32 d2, [%[rng]] \n\t"		//d2 = {invrange, range}
	"vld1.32 {d16, d17}, [%[lut]]! \n\t"	//d16 = {p7, p7}, d17 = {p6, p6}
	"vmul.f32 d6, d0, d2[0] \n\t"		//d6 = d0 * d2[0]
	"vcvt.s32.f32 d6, d6 \n\t"		//d6 = (int) d6
	"vld1.32 {d18}, [%[lut]]! \n\t"		//d18 = {p5, p5}
	"vcvt.f32.s32 d1, d6 \n\t"		//d1 = (float) d6
	"vld1.32 {d19}, [%[lut]]! \n\t"		//d19 = {p4, p4}
	"vmls.f32 d0, d1, d2[1] \n\t"		//d0 = d0 - d1 * d2[1]
	"vld1.32 {d20}, [%[lut]]! \n\t"		//d20 = {p3, p3}

	//polynomial:
	"vmla.f32 d17, d16, d0 \n\t"		//d17 = d17 + d16 * d0;
	"vld1.32 {d21}, [%[lut]]! \n\t"		//d21 = {p2, p2}
	"vmla.f32 d18, d17, d0 \n\t"		//d18 = d18 + d17 * d0;
	"vld1.32 {d22}, [%[lut]]! \n\t"		//d22 = {p1, p1}
	"vmla.f32 d19, d18, d0 \n\t"		//d19 = d19 + d18 * d0;
	"vld1.32 {d23}, [%[lut]]! \n\t"		//d23 = {p0, p0}
	"vmla.f32 d20, d19, d0 \n\t"		//d20 = d20 + d19 * d0;
	"vmla.f32 d21, d20, d0 \n\t"		//d21 = d21 + d20 * d0;
	"vmla.f32 d22, d21, d0 \n\t"		//d22 = d22 + d21 * d0;
	"vmla.f32 d23, d22, d0 \n\t"		//d23 = d23 + d22 * d0;

	//multiply by 2 ^ m
	"vshl.i32 d6, d6, #23 \n\t"		//d6 = d6 << 23
	"vadd.i32 d0, d23, d6 \n\t"		//d0 = d23 + d6

	"vdup.f32 d2, d0[1] \n\t"		//d2 = s1
	"vmov.f32 d1, #0.5 \n\t"		//d1 = 0.5
	"vsub.f32 d0, d0, d2 \n\t"		//d0 = d0 - d2
	"vmul.f32 d0, d1 \n\t"			//d0 = d0 * d1

	: [lut] "+r"(lut)
	: [rng] "r"(__sinhf_rng)
	// q8-q11 (d16-d23) hold the coefficients and must be declared too.
	: "d0", "d1", "q1", "q2", "d6", "q8", "q9", "q10", "q11"
	);
#else
	return sinhf_c(x);
#endif
}
|
||||
|
||||
/*
 * sinhf entry point for the soft-float calling convention.
 *
 * With __MATH_NEON defined this is a register-level hack: it assumes x
 * arrives in r0, copies it into s0, calls sinhf_neon_hfp(), and copies s0
 * back into r0 as the result (there is no C-level return on that path).
 * Without __MATH_NEON the portable sinhf_c() is used.
 */
float sinhf_neon_sfp(float x)
{
#ifdef __MATH_NEON
	asm volatile ("vmov.f32 s0, r0 \n\t");
	sinhf_neon_hfp(x);
	asm volatile ("vmov.f32 r0, s0 \n\t");
#else
	return sinhf_c(x);
#endif
}
|
105
deps/math-neon/source/math_sqrtf.c
vendored
105
deps/math-neon/source/math_sqrtf.c
vendored
@ -1,105 +0,0 @@
|
||||
/*
|
||||
The MIT License (MIT)
|
||||
|
||||
Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com)
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
/*
|
||||
Test func : sqrtf(x)
|
||||
Test Range: 0 < x < 1,000,000,000
|
||||
Peak Error: ~0.0010%
|
||||
RMS Error: ~0.0005%
|
||||
*/
|
||||
|
||||
#include "math.h"
|
||||
#include "math_neon.h"
|
||||
|
||||
/*
 * Portable square-root approximation.
 *
 * Step 1 estimates 1/sqrt(x) from the bit pattern of x and refines it with
 * two Newton iterations.  Step 2 takes the reciprocal of that estimate
 * (again a bit-level seed plus two Newton iterations), which yields sqrt(x).
 */
float sqrtf_c(float x)
{
	union {
		float 	f;
		int 	i;
	} est;

	float rsq, prod, step;
	int rebias, pass;

	//fast invsqrt approx
	est.f = x;
	est.i = 0x5F3759DF - (est.i >> 1);		//VRSQRTE
	for (pass = 0; pass < 2; pass++) {
		prod = x * est.f;
		step = (3.0f - prod * est.f) * 0.5;	//VRSQRTS
		est.f = est.f * step;
	}

	//fast inverse approx
	rsq = est.f;
	rebias = 0x3F800000 - (est.i & 0x7F800000);
	est.i = est.i + rebias;
	est.f = 1.41176471f - 0.47058824f * est.f;
	est.i = est.i + rebias;
	for (pass = 0; pass < 2; pass++) {
		step = 2.0 - est.f * rsq;
		est.f = est.f * step;
	}

	return est.f;
}
|
||||
|
||||
/*
 * NEON square-root approximation: reciprocal square-root estimate with two
 * refinement steps, followed by a reciprocal estimate with two refinement
 * steps.
 *
 * HACK: with __MATH_NEON defined the asm reads x from d0 and leaves the
 * result in d0 without going through asm operands or a C return statement,
 * so it assumes the float argument and result both live in s0.
 * Without __MATH_NEON it falls back to sqrtf_c().
 */
float sqrtf_neon_hfp(float x)
{
#ifdef __MATH_NEON
	asm volatile (

	//fast invsqrt approx
	"vmov.f32 d1, d0 \n\t"			//d1 = d0
	"vrsqrte.f32 d0, d0 \n\t"		//d0 = ~ 1.0 / sqrt(d0)
	"vmul.f32 d2, d0, d1 \n\t"		//d2 = d0 * d1
	"vrsqrts.f32 d3, d2, d0 \n\t"		//d3 = (3 - d0 * d2) / 2
	"vmul.f32 d0, d0, d3 \n\t"		//d0 = d0 * d3
	"vmul.f32 d2, d0, d1 \n\t"		//d2 = d0 * d1
	"vrsqrts.f32 d3, d2, d0 \n\t"		//d3 = (3 - d0 * d2) / 2
	"vmul.f32 d0, d0, d3 \n\t"		//d0 = d0 * d3

	//fast reciprocal approximation
	"vrecpe.f32 d1, d0 \n\t"		//d1 = ~ 1 / d0;
	"vrecps.f32 d2, d1, d0 \n\t"		//d2 = 2.0 - d1 * d0;
	"vmul.f32 d1, d1, d2 \n\t"		//d1 = d1 * d2;
	"vrecps.f32 d2, d1, d0 \n\t"		//d2 = 2.0 - d1 * d0;
	"vmul.f32 d0, d1, d2 \n\t"		//d0 = d1 * d2;

	::: "d0", "d1", "d2", "d3"
	);
#else
	return sqrtf_c(x);
#endif
}
|
||||
|
||||
/*
 * sqrtf entry point for the soft-float calling convention.
 *
 * Without __MATH_NEON the portable sqrtf_c() is used.  With it, this is a
 * register-level hack: it assumes x arrives in r0, copies it into s0, calls
 * sqrtf_neon_hfp(), and copies s0 back into r0 as the result (there is no
 * C-level return on that path).
 */
float sqrtf_neon_sfp(float x)
{
#ifndef __MATH_NEON
	return sqrtf_c(x);
#else
	asm volatile ("vmov.f32 s0, r0 \n\t");
	sqrtf_neon_hfp(x);
	asm volatile ("vmov.f32 r0, s0 \n\t");
#endif
}
|
147
deps/math-neon/source/math_sqrtfv.c
vendored
147
deps/math-neon/source/math_sqrtfv.c
vendored
@ -1,147 +0,0 @@
|
||||
/*
|
||||
The MIT License (MIT)
|
||||
|
||||
Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com)
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
/*
|
||||
Test func : sqrtf(x)
|
||||
Test Range: 0 < x < 1,000,000,000
|
||||
Peak Error: ~0.0010%
|
||||
RMS Error: ~0.0005%
|
||||
*/
|
||||
|
||||
#include "math.h"
|
||||
#include "math_neon.h"
|
||||
|
||||
/*
 * Portable vector square root: writes an approximation of sqrt(x[i]) to r[i]
 * for i in [0, n).
 *
 * An odd leading element is handled by sqrtf_c(); the remaining elements are
 * processed two per iteration with the same two-stage scheme as the scalar
 * routine (inverse square root, then reciprocal, each with two Newton steps).
 */
void sqrtfv_c(float *x, int n, float *r)
{

	float x0, x1;
	float b0, b1, c0, c1;
	int m0, m1;
	union {
		float 	f;
		int 	i;
	} a0, a1;


	if (n & 0x1){
		*r++ = sqrtf_c(*x++);
		n--;
	}

	while(n > 0){

		x0 = *x++;
		x1 = *x++;

		//fast invsqrt approx
		a0.f = x0;
		a1.f = x1;
		a0.i = 0x5F3759DF - (a0.i >> 1);	//VRSQRTE
		a1.i = 0x5F3759DF - (a1.i >> 1);	//VRSQRTE
		c0 = x0 * a0.f;
		c1 = x1 * a1.f;
		b0 = (3.0f - c0 * a0.f) * 0.5;		//VRSQRTS
		b1 = (3.0f - c1 * a1.f) * 0.5;		//VRSQRTS
		a0.f = a0.f * b0;
		a1.f = a1.f * b1;
		c0 = x0 * a0.f;
		c1 = x1 * a1.f;
		b0 = (3.0f - c0 * a0.f) * 0.5;		//VRSQRTS
		b1 = (3.0f - c1 * a1.f) * 0.5;		//VRSQRTS
		a0.f = a0.f * b0;
		a1.f = a1.f * b1;

		//fast inverse approx
		//c0/c1 keep the inverse-sqrt estimates that the reciprocal Newton
		//steps below divide out; each lane must keep its own value.
		c0 = a0.f;
		c1 = a1.f;
		m0 = 0x3F800000 - (a0.i & 0x7F800000);
		m1 = 0x3F800000 - (a1.i & 0x7F800000);
		a0.i = a0.i + m0;
		a1.i = a1.i + m1;
		a0.f = 1.41176471f - 0.47058824f * a0.f;
		a1.f = 1.41176471f - 0.47058824f * a1.f;
		a0.i = a0.i + m0;
		a1.i = a1.i + m1;
		b0 = 2.0 - a0.f * c0;
		b1 = 2.0 - a1.f * c1;
		a0.f = a0.f * b0;
		a1.f = a1.f * b1;
		b0 = 2.0 - a0.f * c0;
		b1 = 2.0 - a1.f * c1;
		a0.f = a0.f * b0;
		a1.f = a1.f * b1;

		*r++ = a0.f;
		*r++ = a1.f;
		n -= 2;

	}
}
|
||||
|
||||
/*
 * Vector square root: writes an approximation of sqrt(x[i]) to r[i] for
 * i in [0, n).
 *
 * The hand-written NEON loop is compiled out (#if 0), so the portable
 * sqrtfv_c() does the work in every configuration.  The disabled assembly is
 * kept below for reference only.
 */
void sqrtfv_neon(float *x, int n, float *r)
{
	sqrtfv_c(x, n, r);

#if 0
	asm volatile (

	"tst r1, #1 \n\t"			//r1 & 1
	"beq 1f \n\t"				//even count: skip the single-element step

	"vld1.32 d0[0], [r0]! \n\t"		//s0 = *x++
	"mov ip, lr \n\t"			//ip = lr
	//"bl sqrtf_neon_hfp \n\t"		//sqrtf_neon
	"mov lr, ip \n\t"			//lr = ip
	"vst1.32 d0[0], [r2]! \n\t"		//*r++ = s0
	"subs r1, r1, #1 \n\t"			//r1 = r1 - 1;
	"bxeq lr \n\t"				//return when nothing is left

	"1: \n\t"				//two elements per iteration

	"vld1.32 d0, [r0]! \n\t"		//d0 = (*x[0], *x[1]), x+=2;

	//fast invsqrt approx
	"vmov.f32 d1, d0 \n\t"			//d1 = d0
	"vrsqrte.f32 d0, d0 \n\t"		//d0 = ~ 1.0 / sqrt(d0)
	"vmul.f32 d2, d0, d1 \n\t"		//d2 = d0 * d1
	"vrsqrts.f32 d3, d2, d0 \n\t"		//d3 = (3 - d0 * d2) / 2
	"vmul.f32 d0, d0, d3 \n\t"		//d0 = d0 * d3
	"vmul.f32 d2, d0, d1 \n\t"		//d2 = d0 * d1
	"vrsqrts.f32 d3, d2, d0 \n\t"		//d3 = (3 - d0 * d2) / 2
	"vmul.f32 d0, d0, d3 \n\t"		//d0 = d0 * d3

	//fast reciprocal approximation
	"vrecpe.f32 d1, d0 \n\t"		//d1 = ~ 1 / d0;
	"vrecps.f32 d2, d1, d0 \n\t"		//d2 = 2.0 - d1 * d0;
	"vmul.f32 d1, d1, d2 \n\t"		//d1 = d1 * d2;
	"vrecps.f32 d2, d1, d0 \n\t"		//d2 = 2.0 - d1 * d0;
	"vmul.f32 d0, d1, d2 \n\t"		//d0 = d1 * d2;

	"vst1.64 d0, [r2]! \n\t"		//*r++ = d0;
	"subs r1, r1, #2 \n\t"			//n = n - 2; update flags
	"bgt 1b \n\t"				//loop while n > 0

	::: "d0", "d1", "d2", "d3"
	);
#endif
}
|
156
deps/math-neon/source/math_tanf.c
vendored
156
deps/math-neon/source/math_tanf.c
vendored
@ -1,156 +0,0 @@
|
||||
/*
|
||||
The MIT License (MIT)
|
||||
|
||||
Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com)
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "math.h"
|
||||
#include "math_neon.h"
|
||||
|
||||
const float __tanf_rng[2] = {
|
||||
2.0 / M_PI,
|
||||
M_PI / 2.0
|
||||
};
|
||||
|
||||
const float __tanf_lut[4] = {
|
||||
-0.00018365f, //p7
|
||||
-0.16664831f, //p3
|
||||
+0.00830636f, //p5
|
||||
+0.99999661f, //p1
|
||||
};
|
||||
|
||||
float tanf_c(float x){
|
||||
|
||||
union {
|
||||
float f;
|
||||
int i;
|
||||
} ax, c;
|
||||
|
||||
float r, a, b, xx, cc, cx;
|
||||
int m;
|
||||
|
||||
ax.f = fabsf(x);
|
||||
|
||||
//Range Reduction:
|
||||
m = (int) (ax.f * __tanf_rng[0]);
|
||||
ax.f = ax.f - (((float)m) * __tanf_rng[1]);
|
||||
|
||||
//Test Quadrant
|
||||
ax.f = ax.f - (m & 1) * __tanf_rng[1];
|
||||
ax.i = ax.i ^ ((*(int*)&x) & 0x80000000);
|
||||
|
||||
//Taylor Polynomial (Estrins)
|
||||
xx = ax.f * ax.f;
|
||||
a = (__tanf_lut[0] * ax.f) * xx + (__tanf_lut[2] * ax.f);
|
||||
b = (__tanf_lut[1] * ax.f) * xx + (__tanf_lut[3] * ax.f);
|
||||
xx = xx * xx;
|
||||
r = b + a * xx;
|
||||
|
||||
//cosine
|
||||
c.f = 1.0 - r * r;
|
||||
|
||||
//fast invsqrt approximation (2x newton iterations)
|
||||
cc = c.f;
|
||||
c.i = 0x5F3759DF - (c.i >> 1); //VRSQRTE
|
||||
cx = cc * c.f;
|
||||
a = (3.0f - cx * c.f) / 2; //VRSQRTS
|
||||
c.f = c.f * a;
|
||||
cx = cc * c.f;
|
||||
a = (3.0f - cx * c.f) / 2;
|
||||
c.f = c.f * a;
|
||||
|
||||
r = r * c.f;
|
||||
|
||||
return r;
|
||||
}
|
||||
|
||||
|
||||
/*
 * NEON tangent approximation: sine polynomial followed by a multiply with
 * 1/sqrt(1 - s*s), mirroring tanf_c().
 *
 * HACK: with __MATH_NEON defined the asm reads x from d0[0] and leaves the
 * result in s0 without going through asm operands or a C return statement,
 * so it assumes the float argument and result both live in s0.
 * Without __MATH_NEON it falls back to tanf_c().
 */
float tanf_neon_hfp(float x)
{
#ifdef __MATH_NEON
	asm volatile (

	"vdup.f32 d0, d0[0] \n\t"		//d0 = {x, x}
	"vabs.f32 d1, d0 \n\t"			//d1 = {ax, ax}

	//Range Reduction:
	"vld1.32 d3, [%0] \n\t"			//d3 = {invrange, range}
	"vmul.f32 d2, d1, d3[0] \n\t"		//d2 = d1 * d3[0]
	"vcvt.u32.f32 d2, d2 \n\t"		//d2 = (int) d2
	"vcvt.f32.u32 d4, d2 \n\t"		//d4 = (float) d2
	"vmls.f32 d1, d4, d3[1] \n\t"		//d1 = d1 - d4 * d3[1]

	//Checking Quadrant:
	//ax = ax - (k&1) * M_PI_2
	"vmov.i32 d4, #1 \n\t"			//d4 = 1
	"vand.i32 d2, d2, d4 \n\t"		//d2 = d2 & d4
	"vcvt.f32.u32 d2, d2 \n\t"		//d2 = (float) d2
	"vmls.f32 d1, d2, d3[1] \n\t"		//d1 = d1 - d2 * d3[1]

	//ax = ax ^ ( x.i & 0x80000000)
	"vmov.i32 d4, #0x80000000 \n\t"		//d4 = 0x80000000
	"vand.i32 d0, d0, d4 \n\t"		//d0 = d0 & d4
	"veor.i32 d1, d1, d0 \n\t"		//d1 = d1 ^ d0

	//polynomial:
	"vmul.f32 d2, d1, d1 \n\t"		//d2 = d1*d1 = {x^2, x^2}
	"vld1.32 {d4, d5}, [%1] \n\t"		//d4 = {p7, p3}, d5 = {p5, p1}
	"vmul.f32 d3, d2, d2 \n\t"		//d3 = d2*d2 = {x^4, x^4}
	"vmul.f32 q0, q2, d1[0] \n\t"		//q0 = q2 * d1[0] = {p7x, p3x, p5x, p1x}
	"vmla.f32 d1, d0, d2[0] \n\t"		//d1 = d1 + d0*d2 = {p5x + p7x^3, p1x + p3x^3}
	"vmla.f32 d1, d3, d1[0] \n\t"		//d1 = d1 + d3*d1[0] = {..., p1x + p3x^3 + p5x^5 + p7x^7}

	//cosine
	"vmov.f32 s1, #1.0 \n\t"		//d0[1] = 1.0
	"vmls.f32 d0, d1, d1 \n\t"		//d0 = {..., 1.0 - sx*sx}

	//invsqrt approx
	"vmov.f32 d2, d0 \n\t"			//d2 = d0
	"vrsqrte.f32 d0, d0 \n\t"		//d0 = ~ 1.0 / sqrt(d0)
	"vmul.f32 d3, d0, d2 \n\t"		//d3 = d0 * d2
	"vrsqrts.f32 d4, d3, d0 \n\t"		//d4 = (3 - d0 * d3) / 2
	"vmul.f32 d0, d0, d4 \n\t"		//d0 = d0 * d4
	"vmul.f32 d3, d0, d2 \n\t"		//d3 = d0 * d2
	"vrsqrts.f32 d4, d3, d0 \n\t"		//d4 = (3 - d0 * d3) / 2
	"vmul.f32 d0, d0, d4 \n\t"		//d0 = d0 * d4

	"vmul.f32 d0, d0, d1 \n\t"		//d0 = d0 * d1

	"vmov.f32 s0, s1 \n\t"			//s0 = s1

	:: "r"(__tanf_rng), "r"(__tanf_lut)
	: "d0", "d1", "d2", "d3", "d4", "d5"
	);
#else
	return tanf_c(x);
#endif
}
|
||||
|
||||
|
||||
/*
 * Tangent wrapper around tanf_neon_hfp().
 *
 * With __MATH_NEON: copies r0 into both lanes of d0, calls tanf_neon_hfp(),
 * then moves s0 back into r0. There is no return statement on this path.
 * NOTE(review): presumably x arrives in r0 and the caller reads the result
 * from r0 (soft-float style argument passing) -- confirm for the target ABI.
 *
 * Without __MATH_NEON: returns tanf_c(x).
 */
float tanf_neon_sfp(float x)
{
#ifndef __MATH_NEON
    return tanf_c(x);
#else
    asm volatile ("vdup.f32 d0, r0 \n\t");
    tanf_neon_hfp(x);
    asm volatile ("vmov.f32 r0, s0 \n\t");
#endif
}
|
||||
|
95
deps/math-neon/source/math_tanhf.c
vendored
95
deps/math-neon/source/math_tanhf.c
vendored
@ -1,95 +0,0 @@
|
||||
/*
|
||||
The MIT License (MIT)
|
||||
|
||||
Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com)
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "math.h"
|
||||
#include "math_neon.h"
|
||||
|
||||
/*
|
||||
TanH = (e^x - e^-x) / (e^x + e^-x)
|
||||
TanH = (e^x - e^-x)(e^x) / (e^x + e^-x)(e^x)
|
||||
TanH = (e^2x - 1) / (e^2x + 1)
|
||||
|
||||
*/
|
||||
|
||||
/*
 * Hyperbolic tangent in plain C, using tanh(x) = (e^2x - 1) / (e^2x + 1).
 *
 * e^2x comes from expf_c(); the division is replaced by an approximate
 * reciprocal of (e^2x + 1) that is refined with two Newton-Raphson steps.
 *
 * x: input value.
 * Returns the approximated tanh(x).
 */
float tanhf_c(float x)
{
    union {
        float f;
        int i;
    } rcp;
    float e2x, denom, numer, corr;
    int rescale;

    x = 2.0f * x;
    e2x = expf_c(x);
    denom = e2x + 1.0f;

    /* Reciprocal approximation: 'rescale' is the integer offset that moves
       the exponent field of denom to that of 1.0f. It is applied before and
       after the linear estimate 1.41176471 - 0.47058824 * m. */
    rcp.f = denom;
    rescale = 0x3F800000 - (rcp.i & 0x7F800000);
    rcp.i = rcp.i + rescale;
    rcp.f = 1.41176471f - 0.47058824f * rcp.f;
    rcp.i = rcp.i + rescale;

    /* Two Newton-Raphson steps: rcp = rcp * (2 - rcp * denom). */
    corr = 2.0 - rcp.f * denom;
    rcp.f = rcp.f * corr;
    corr = 2.0 - rcp.f * denom;
    rcp.f = rcp.f * corr;

    /* Multiply the reciprocal by the numerator (e^2x - 1). */
    numer = e2x - 1.0;
    rcp.f *= numer;
    return rcp.f;
}
|
||||
|
||||
|
||||
/*
 * Hyperbolic tangent written in NEON assembly around expf_neon_hfp().
 *
 * The asm statements work on d0 directly rather than on a named operand:
 * d0 is doubled, expf_neon_hfp() is called, and the second asm block turns
 * the value then found in d0 into (d0 - 1) / (d0 + 1). The function has no
 * return statement.
 * NOTE(review): this presumably relies on the hard-float calling convention
 * keeping the float argument/result in s0 across the call -- confirm.
 * When __MATH_NEON is not defined the body is empty and no value is returned.
 */
float tanhf_neon_hfp(float x)
{
#ifdef __MATH_NEON
    asm volatile ("vadd.f32 d0, d0, d0 \n\t"); //d0 = d0 + d0
    expf_neon_hfp(x);
    asm volatile (
    "vmov.f32 d2, #1.0 \n\t" //d2 = 1.0
    "vsub.f32 d3, d0, d2 \n\t" //d3 = d0 - 1.0 (numerator)
    "vadd.f32 d0, d0, d2 \n\t" //d0 = d0 + 1.0 (denominator)

    //Reciprocal of the denominator: estimate plus two refinement steps
    "vrecpe.f32 d1, d0 \n\t" //d1 = ~ 1 / d0
    "vrecps.f32 d2, d1, d0 \n\t" //d2 = 2.0 - d1 * d0
    "vmul.f32 d1, d1, d2 \n\t" //d1 = d1 * d2
    "vrecps.f32 d2, d1, d0 \n\t" //d2 = 2.0 - d1 * d0
    "vmul.f32 d0, d1, d2 \n\t" //d0 = d1 * d2
    "vmul.f32 d0, d0, d3 \n\t" //d0 = reciprocal * numerator
    ::: "d0", "d1", "d2", "d3"
    );
#endif
}
|
||||
|
||||
/*
 * Hyperbolic tangent wrapper around tanhf_neon_hfp().
 *
 * With __MATH_NEON: moves r0 into s0, calls tanhf_neon_hfp(), then moves s0
 * back into r0. There is no return statement on this path.
 * NOTE(review): presumably x arrives in r0 and the caller reads the result
 * from r0 (soft-float style argument passing) -- confirm for the target ABI.
 *
 * Without __MATH_NEON: returns tanhf_c(x).
 */
float tanhf_neon_sfp(float x)
{
#ifndef __MATH_NEON
    return tanhf_c(x);
#else
    asm volatile ("vmov.f32 s0, r0 \n\t");
    tanhf_neon_hfp(x);
    asm volatile ("vmov.f32 r0, s0 \n\t");
#endif
}
|
||||
|
118
deps/math-neon/source/math_vec2.c
vendored
118
deps/math-neon/source/math_vec2.c
vendored
@ -1,118 +0,0 @@
|
||||
/*
|
||||
The MIT License (MIT)
|
||||
|
||||
Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com)
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
|
||||
#include "math_neon.h"
|
||||
|
||||
//vec2 scalar product
|
||||
/*
 * Dot product of two 2-component vectors, plain C.
 *
 * v0, v1: input vectors (two floats each).
 * Returns v0[0]*v1[0] + v0[1]*v1[1].
 */
float
dot2_c(float v0[2], float v1[2])
{
    const float xx = v0[0]*v1[0];
    const float yy = v0[1]*v1[1];

    return xx + yy;
}
|
||||
|
||||
/*
 * Scale a 2-component vector by an approximate 1/sqrt(x*x + y*y), plain C.
 *
 * The inverse square root starts from the integer bit-trick estimate
 * (constant 0x5F3759DF) and is refined with two Newton-Raphson steps, so the
 * result is an approximation rather than an exactly rounded value.
 *
 * v: input vector (two floats).
 * d: output vector (two floats); receives v scaled by the estimate.
 */
void
normalize2_c(float v[2], float d[2])
{
    union {
        float f;
        int i;
    } inv;
    float len_sq, prod, step;
    int pass;

    len_sq = v[0]*v[0];
    len_sq += v[1]*v[1];

    /* initial estimate (the NEON code uses VRSQRTE here) */
    inv.f = len_sq;
    inv.i = 0x5F3759DF - (inv.i >> 1);

    /* two refinement steps (the NEON code uses VRSQRTS here) */
    for (pass = 0; pass < 2; pass++) {
        prod = len_sq * inv.f;
        step = (3.0f - prod * inv.f) * 0.5;
        inv.f = inv.f * step;
    }

    d[0] = v[0]*inv.f;
    d[1] = v[1]*inv.f;
}
|
||||
|
||||
/*
 * Dot product of two 2-component vectors, NEON assembly.
 *
 * The result is left in d0 (both lanes) by the asm; the function has no
 * return statement.
 * NOTE(review): this presumably relies on the hard-float calling convention
 * returning floats in s0 -- confirm for the target ABI.
 * When __MATH_NEON is not defined the body is empty and no value is returned.
 *
 * v0, v1: input vectors (two floats each), read through the pointers.
 */
float
dot2_neon_hfp(float v0[2], float v1[2])
{
#ifdef __MATH_NEON
    asm volatile (
    "vld1.32 {d2}, [%0] \n\t" //d2 = {x0, y0}
    "vld1.32 {d4}, [%1] \n\t" //d4 = {x1, y1}
    "vmul.f32 d0, d2, d4 \n\t" //d0 = {x0*x1, y0*y1}
    "vpadd.f32 d0, d0, d0 \n\t" //d0 = x0*x1 + y0*y1 in both lanes
    :: "r"(v0), "r"(v1)
    //The asm overwrites d0, d2 and d4 and reads memory through the pointer
    //operands, so all of that has to be declared to the compiler.
    : "d0", "d2", "d4", "memory"
    );
#endif
}
|
||||
|
||||
/*
 * 2-component dot product wrapper around dot2_neon_hfp().
 *
 * With __MATH_NEON: calls dot2_neon_hfp() and then moves s0 into r0. There
 * is no return statement on this path.
 * NOTE(review): presumably the caller reads the float result from r0
 * (soft-float style return) -- confirm for the target ABI.
 *
 * Without __MATH_NEON: returns dot2_c(v0, v1).
 */
float
dot2_neon_sfp(float v0[2], float v1[2])
{
#ifndef __MATH_NEON
    return dot2_c(v0, v1);
#else
    dot2_neon_hfp(v0, v1);
    asm volatile ("vmov.f32 r0, s0 \n\t");
#endif
}
|
||||
|
||||
/*
 * Scale a 2-component vector by an approximate inverse length.
 *
 * With __MATH_NEON the work is done in NEON assembly: squared length,
 * VRSQRTE estimate, two VRSQRTS refinement steps, then multiply and store.
 * Without __MATH_NEON it forwards to normalize2_c().
 *
 * v: input vector (two floats).
 * d: output vector (two floats).
 */
void
normalize2_neon(float v[2], float d[2])
{
#ifndef __MATH_NEON
    normalize2_c(v, d);
#else
    asm volatile (
    "vld1.32 d4, [%0] \n\t" //d4 = {x0, y0}
    "vmul.f32 d0, d4, d4 \n\t" //d0 = d4*d4 = {x0*x0, y0*y0}
    "vpadd.f32 d0, d0 \n\t" //d0 = x0*x0 + y0*y0 in both lanes

    //Inverse square root of the squared length
    "vmov.f32 d1, d0 \n\t" //d1 = squared length
    "vrsqrte.f32 d0, d0 \n\t" //d0 = ~ 1.0 / sqrt(d0)
    "vmul.f32 d2, d0, d1 \n\t" //d2 = d0 * d1
    "vrsqrts.f32 d3, d2, d0 \n\t" //d3 = (3 - d0 * d2) / 2
    "vmul.f32 d0, d0, d3 \n\t" //d0 = d0 * d3
    "vmul.f32 d2, d0, d1 \n\t" //d2 = d0 * d1
    "vrsqrts.f32 d3, d2, d0 \n\t" //d3 = (3 - d0 * d2) / 2
    "vmul.f32 d0, d0, d3 \n\t" //d0 = d0 * d3

    "vmul.f32 d4, d4, d0[0] \n\t" //d4 = {x0, y0} * d0[0]
    "vst1.32 d4, [%1] \n\t" //d[0], d[1] = d4

    :: "r"(v), "r"(d)
    : "d0", "d1", "d2", "d3", "d4", "memory"
    );
#endif
}
|
||||
|
172
deps/math-neon/source/math_vec3.c
vendored
172
deps/math-neon/source/math_vec3.c
vendored
@ -1,172 +0,0 @@
|
||||
/*
|
||||
The MIT License (MIT)
|
||||
|
||||
Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com)
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "math_neon.h"
|
||||
|
||||
//vec3 scalar product
|
||||
/*
 * Dot product of two 3-component vectors, plain C.
 *
 * v0, v1: input vectors (three floats each).
 * Returns v0[0]*v1[0] + v0[1]*v1[1] + v0[2]*v1[2], accumulated in that order.
 */
float
dot3_c(float v0[3], float v1[3])
{
    float sum = v0[0]*v1[0];
    int k;

    for (k = 1; k < 3; k++)
        sum += v0[k]*v1[k];

    return sum;
}
|
||||
|
||||
/*
 * Cross product of two 3-component vectors, plain C: d = v0 x v1.
 *
 * All three components are computed before anything is stored, so the call
 * also gives the right answer when d is the same array as v0 or v1.
 *
 * v0, v1: input vectors (three floats each).
 * d:      output vector (three floats); may alias v0 or v1.
 */
void
cross3_c(float v0[3], float v1[3], float d[3])
{
    const float cx = v0[1]*v1[2] - v0[2]*v1[1];
    const float cy = v0[2]*v1[0] - v0[0]*v1[2];
    const float cz = v0[0]*v1[1] - v0[1]*v1[0];

    d[0] = cx;
    d[1] = cy;
    d[2] = cz;
}
|
||||
|
||||
/*
 * Scale a 3-component vector by an approximate 1/sqrt(x*x + y*y + z*z),
 * plain C.
 *
 * The inverse square root starts from the integer bit-trick estimate
 * (constant 0x5F3759DF) and is refined with two Newton-Raphson steps, so the
 * result is an approximation rather than an exactly rounded value.
 *
 * v: input vector (three floats).
 * d: output vector (three floats); receives v scaled by the estimate.
 */
void
normalize3_c(float v[3], float d[3])
{
    union {
        float f;
        int i;
    } inv;
    float len_sq, prod, step;
    int pass;

    len_sq = v[0]*v[0];
    len_sq += v[1]*v[1];
    len_sq += v[2]*v[2];

    /* initial estimate (the NEON code uses VRSQRTE here) */
    inv.f = len_sq;
    inv.i = 0x5F3759DF - (inv.i >> 1);

    /* two refinement steps (the NEON code uses VRSQRTS here) */
    for (pass = 0; pass < 2; pass++) {
        prod = len_sq * inv.f;
        step = (3.0f - prod * inv.f) * 0.5;
        inv.f = inv.f * step;
    }

    d[0] = v[0]*inv.f;
    d[1] = v[1]*inv.f;
    d[2] = v[2]*inv.f;
}
|
||||
|
||||
|
||||
/*
 * Dot product of two 3-component vectors, NEON assembly.
 *
 * The result is left in lane 0 of d0 by the asm; the function has no return
 * statement.
 * NOTE(review): this presumably relies on the hard-float calling convention
 * returning floats in s0 -- confirm for the target ABI.
 * When __MATH_NEON is not defined the body is empty and no value is returned.
 *
 * v0, v1: input vectors (three floats each), read through the pointers.
 */
float
dot3_neon_hfp(float v0[3], float v1[3])
{
#ifdef __MATH_NEON
    asm volatile (
    "vld1.32 {d2}, [%0] \n\t" //d2 = {x0, y0}
    "flds s6, [%0, #8] \n\t" //d3[0] = z0 (d3[1] is not loaded)
    "vld1.32 {d4}, [%1] \n\t" //d4 = {x1, y1}
    "flds s10, [%1, #8] \n\t" //d5[0] = z1 (d5[1] is not loaded)

    "vmul.f32 d0, d2, d4 \n\t" //d0 = {x0*x1, y0*y1}
    "vpadd.f32 d0, d0, d0 \n\t" //d0 = x0*x1 + y0*y1 in both lanes
    "vmla.f32 d0, d3, d5 \n\t" //d0[0] = d0[0] + z0*z1
    :: "r"(v0), "r"(v1)
    //"memory" is listed because the asm reads the arrays through the
    //pointer operands, which the compiler cannot see otherwise.
    : "d0","d1","d2","d3","d4","d5","memory"
    );
#endif
}
|
||||
|
||||
/*
 * 3-component dot product wrapper around dot3_neon_hfp().
 *
 * With __MATH_NEON: calls dot3_neon_hfp() and then moves s0 into r0. There
 * is no return statement on this path.
 * NOTE(review): presumably the caller reads the float result from r0
 * (soft-float style return) -- confirm for the target ABI.
 *
 * Without __MATH_NEON: returns dot3_c(v0, v1).
 */
float
dot3_neon_sfp(float v0[3], float v1[3])
{
#ifndef __MATH_NEON
    return dot3_c(v0, v1);
#else
    dot3_neon_hfp(v0, v1);
    asm volatile ("vmov.f32 r0, s0 \n\t");
#endif
}
|
||||
|
||||
|
||||
/*
 * Cross product of two 3-component vectors: d = v0 x v1.
 *
 * With __MATH_NEON the work is done in NEON assembly; both inputs are loaded
 * into registers before the result is stored. The three pointers are
 * read-write asm operands because the asm advances them.
 * Without __MATH_NEON it forwards to cross3_c().
 *
 * v0, v1: input vectors (three floats each).
 * d:      output vector (three floats).
 */
void cross3_neon(float v0[3], float v1[3], float d[3])
{
#ifndef __MATH_NEON
    cross3_c(v0,v1,d);
#else
    asm volatile (
    "flds s3, [%0] \n\t" //d1[1] = x0
    "add %0, %0, #4 \n\t" //v0 now points at y0
    "vld1.32 {d0}, [%0] \n\t" //d0 = {y0, z0}
    "vmov.f32 s2, s1 \n\t" //d1[0] = z0, so d1 = {z0, x0}

    "flds s5, [%1] \n\t" //d2[1] = x1
    "add %1, %1, #4 \n\t" //v1 now points at y1
    "vld1.32 {d3}, [%1] \n\t" //d3 = {y1, z1}
    "vmov.f32 s4, s7 \n\t" //d2[0] = z1, so d2 = {z1, x1}

    "vmul.f32 d4, d0, d2 \n\t" //d4 = {y0*z1, z0*x1}
    "vmls.f32 d4, d1, d3 \n\t" //d4 = {y0*z1 - z0*y1, z0*x1 - x0*z1}

    "vmul.f32 d5, d3, d1[1] \n\t" //d5 = {y1*x0, z1*x0}
    "vmls.f32 d5, d0, d2[1] \n\t" //d5[0] = x0*y1 - y0*x1

    "vst1.32 d4, [%2] \n\t" //d[0], d[1] = d4
    "add %2, %2, #8 \n\t" //d now points at d[2]
    "fsts s10, [%2] \n\t" //d[2] = d5[0]

    : "+r"(v0), "+r"(v1), "+r"(d):
    : "d0", "d1", "d2", "d3", "d4", "d5", "memory"
    );
#endif
}
|
||||
|
||||
/*
 * Scale a 3-component vector by an approximate inverse length.
 *
 * With __MATH_NEON the work is done in NEON assembly: squared length,
 * VRSQRTE estimate, two VRSQRTS refinement steps, then multiply and store.
 * Without __MATH_NEON it forwards to normalize3_c().
 *
 * v: input vector (three floats).
 * d: output vector (three floats).
 */
void
normalize3_neon(float v[3], float d[3])
{
#ifndef __MATH_NEON
    normalize3_c(v, d);
#else
    asm volatile (
    "vld1.32 {d4}, [%0] \n\t" //d4 = {x0, y0}
    "flds s10, [%0, #8] \n\t" //d5[0] = z0 (d5[1] is not loaded)

    "vmul.f32 d0, d4, d4 \n\t" //d0 = {x0*x0, y0*y0}
    "vpadd.f32 d0, d0 \n\t" //d0 = x0*x0 + y0*y0 in both lanes
    "vmla.f32 d0, d5, d5 \n\t" //d0[0] = d0[0] + z0*z0

    //Inverse square root of the squared length (lane 0 is the one used)
    "vmov.f32 d1, d0 \n\t" //d1 = d0
    "vrsqrte.f32 d0, d0 \n\t" //d0 = ~ 1.0 / sqrt(d0)
    "vmul.f32 d2, d0, d1 \n\t" //d2 = d0 * d1
    "vrsqrts.f32 d3, d2, d0 \n\t" //d3 = (3 - d0 * d2) / 2
    "vmul.f32 d0, d0, d3 \n\t" //d0 = d0 * d3
    "vmul.f32 d2, d0, d1 \n\t" //d2 = d0 * d1
    "vrsqrts.f32 d3, d2, d0 \n\t" //d3 = (3 - d0 * d2) / 2
    "vmul.f32 d0, d0, d3 \n\t" //d0 = d0 * d3

    "vmul.f32 q2, q2, d0[0] \n\t" //q2 = {d4, d5} * d0[0]
    "vst1.32 {d4}, [%1] \n\t" //d[0], d[1] = d4
    "fsts s10, [%1, #8] \n\t" //d[2] = d5[0]

    :: "r"(v), "r"(d)
    : "d0", "d1", "d2", "d3", "d4", "d5", "memory"
    );
#endif
}
|
||||
|
||||
|
126
deps/math-neon/source/math_vec4.c
vendored
126
deps/math-neon/source/math_vec4.c
vendored
@ -1,126 +0,0 @@
|
||||
/*
|
||||
The MIT License (MIT)
|
||||
|
||||
Copyright (c) 2015 Lachlan Tychsen-Smith (lachlan.ts@gmail.com)
|
||||
|
||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||
of this software and associated documentation files (the "Software"), to deal
|
||||
in the Software without restriction, including without limitation the rights
|
||||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
|
||||
copies of the Software, and to permit persons to whom the Software is
|
||||
furnished to do so, subject to the following conditions:
|
||||
|
||||
The above copyright notice and this permission notice shall be included in
|
||||
all copies or substantial portions of the Software.
|
||||
|
||||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
|
||||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
|
||||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
|
||||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
|
||||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
|
||||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
|
||||
THE SOFTWARE.
|
||||
*/
|
||||
|
||||
#include "math_neon.h"
|
||||
|
||||
|
||||
#ifdef __MATH_NEON
|
||||
#include "arm_neon.h"
|
||||
#endif
|
||||
|
||||
//vec4 scalar product
|
||||
/*
 * Dot product of two 4-component vectors, plain C.
 *
 * v0, v1: input vectors (four floats each).
 * Returns the sum of the four component products, accumulated in index order.
 */
float dot4_c(float v0[4], float v1[4])
{
    float sum = v0[0]*v1[0];
    int k;

    for (k = 1; k < 4; k++)
        sum += v0[k]*v1[k];

    return sum;
}
|
||||
|
||||
/*
 * Scale a 4-component vector by an approximate inverse length, plain C.
 *
 * The inverse square root of the sum of squares starts from the integer
 * bit-trick estimate (constant 0x5F3759DF) and is refined with two
 * Newton-Raphson steps, so the result is an approximation rather than an
 * exactly rounded value.
 *
 * v: input vector (four floats).
 * d: output vector (four floats); receives v scaled by the estimate.
 */
void normalize4_c(float v[4], float d[4])
{
    union {
        float f;
        int i;
    } inv;
    float len_sq, prod, step;
    int pass;

    len_sq = v[0]*v[0];
    len_sq += v[1]*v[1];
    len_sq += v[2]*v[2];
    len_sq += v[3]*v[3];

    /* initial estimate (the NEON code uses VRSQRTE here) */
    inv.f = len_sq;
    inv.i = 0x5F3759DF - (inv.i >> 1);

    /* two refinement steps (the NEON code uses VRSQRTS here) */
    for (pass = 0; pass < 2; pass++) {
        prod = len_sq * inv.f;
        step = (3.0f - prod * inv.f) * 0.5;
        inv.f = inv.f * step;
    }

    d[0] = v[0]*inv.f;
    d[1] = v[1]*inv.f;
    d[2] = v[2]*inv.f;
    d[3] = v[3]*inv.f;
}
|
||||
|
||||
/*
 * Scale a 4-component vector by an approximate inverse length.
 *
 * With __MATH_NEON the work is done in NEON assembly: squared length,
 * VRSQRTE estimate, two VRSQRTS refinement steps, then multiply and store.
 * Without __MATH_NEON it forwards to normalize4_c().
 *
 * v: input vector (four floats).
 * d: output vector (four floats).
 */
void normalize4_neon(float v[4], float d[4])
{
#ifndef __MATH_NEON
    normalize4_c(v, d);
#else
    asm volatile (
    "vld1.32 {d4, d5}, [%0] \n\t" //d4 = {x0, y0}, d5 = {z0, w0}
    "vmul.f32 d0, d4, d4 \n\t" //d0 = {x0*x0, y0*y0}
    "vmla.f32 d0, d5, d5 \n\t" //d0 = {x0*x0 + z0*z0, y0*y0 + w0*w0}
    "vpadd.f32 d0, d0 \n\t" //d0 = sum of all four squares in both lanes

    //Inverse square root of the squared length
    "vmov.f32 d1, d0 \n\t" //d1 = d0
    "vrsqrte.f32 d0, d0 \n\t" //d0 = ~ 1.0 / sqrt(d0)
    "vmul.f32 d2, d0, d1 \n\t" //d2 = d0 * d1
    "vrsqrts.f32 d3, d2, d0 \n\t" //d3 = (3 - d0 * d2) / 2
    "vmul.f32 d0, d0, d3 \n\t" //d0 = d0 * d3
    "vmul.f32 d2, d0, d1 \n\t" //d2 = d0 * d1
    "vrsqrts.f32 d3, d2, d0 \n\t" //d3 = (3 - d0 * d2) / 2
    "vmul.f32 d0, d0, d3 \n\t" //d0 = d0 * d3

    "vmul.f32 q2, q2, d0[0] \n\t" //q2 = {d4, d5} * d0[0]
    "vst1.32 {d4, d5}, [%1] \n\t" //d[0..3] = q2

    :: "r"(v), "r"(d)
    : "d0", "d1", "d2", "d3", "d4", "d5", "memory"
    );
#endif
}
|
||||
|
||||
|
||||
/*
 * Dot product of two 4-component vectors, NEON assembly.
 *
 * The result is left in d0 (both lanes) by the asm; the function has no
 * return statement.
 * NOTE(review): this presumably relies on the hard-float calling convention
 * returning floats in s0 -- confirm for the target ABI.
 * When __MATH_NEON is not defined the body is empty and no value is returned.
 *
 * v0, v1: input vectors (four floats each), read through the pointers.
 */
float dot4_neon_hfp(float v0[4], float v1[4])
{
#ifdef __MATH_NEON
    asm volatile (
    "vld1.32 {d2, d3}, [%0] \n\t" //d2 = {x0, y0}, d3 = {z0, w0}
    "vld1.32 {d4, d5}, [%1] \n\t" //d4 = {x1, y1}, d5 = {z1, w1}
    "vmul.f32 d0, d2, d4 \n\t" //d0 = {x0*x1, y0*y1}
    "vmla.f32 d0, d3, d5 \n\t" //d0 = {x0*x1 + z0*z1, y0*y1 + w0*w1}
    "vpadd.f32 d0, d0 \n\t" //d0 = sum of all four products in both lanes
    :: "r"(v0), "r"(v1)
    //The asm overwrites d0 and d2-d5 and reads memory through the pointer
    //operands, so all of that has to be declared to the compiler.
    : "d0", "d2", "d3", "d4", "d5", "memory"
    );
#endif
}
|
||||
|
||||
/*
 * 4-component dot product wrapper around dot4_neon_hfp().
 *
 * With __MATH_NEON: calls dot4_neon_hfp() and then moves s0 into r0. There
 * is no return statement on this path.
 * NOTE(review): presumably the caller reads the float result from r0
 * (soft-float style return) -- confirm for the target ABI.
 *
 * Without __MATH_NEON: returns dot4_c(v0, v1).
 */
float dot4_neon_sfp(float v0[4], float v1[4])
{
#ifndef __MATH_NEON
    return dot4_c(v0, v1);
#else
    dot4_neon_hfp(v0, v1);
    asm volatile ("vmov.f32 r0, s0 \n\t");
#endif
}
|
||||
|
Loading…
x
Reference in New Issue
Block a user