finish matching fdlibm files

fdlibm complete!
2025-01-22 14:04:23 +00:00 · 2022-02-11 15:38:29 -05:00 · 2022-02-11 15:38:29 -05:00 · 6b9f925f74
commit 6b9f925f74
parent 46774da475
5 changed files with 461 additions and 581 deletions
--- a/asm/Dolphin/e_asin.s
+++ b/asm/Dolphin/e_asin.s
@ -1,226 +0,0 @@
-.include "macros.inc"
-.section .sdata2, "a"     # 0x80516360 - 0x80520E40
-.balign 8
-.global lbl_805170F0
-lbl_805170F0:
-	.4byte 0x3FF921FB
-	.4byte 0x54442D18
-.global lbl_805170F8
-lbl_805170F8:
-	.4byte 0x3C91A626
-	.4byte 0x33145C07
-.global lbl_80517100
-lbl_80517100:
-	.4byte 0x7E37E43C
-	.4byte 0x8800759C
-.global lbl_80517108
-lbl_80517108:
-	.4byte 0x3FF00000
-	.4byte 0x00000000
-.global lbl_80517110
-lbl_80517110:
-	.4byte 0x3FC55555
-	.4byte 0x55555555
-.global lbl_80517118
-lbl_80517118:
-	.4byte 0xBFD4D612
-	.4byte 0x03EB6F7D
-.global lbl_80517120
-lbl_80517120:
-	.4byte 0x3FC9C155
-	.4byte 0x0E884455
-.global lbl_80517128
-lbl_80517128:
-	.4byte 0xBFA48228
-	.4byte 0xB5688F3B
-.global lbl_80517130
-lbl_80517130:
-	.4byte 0x3F49EFE0
-	.4byte 0x7501B288
-.global lbl_80517138
-lbl_80517138:
-	.4byte 0x3F023DE1
-	.4byte 0x0DFDF709
-.global lbl_80517140
-lbl_80517140:
-	.4byte 0xC0033A27
-	.4byte 0x1C8A2D4B
-.global lbl_80517148
-lbl_80517148:
-	.4byte 0x40002AE5
-	.4byte 0x9C598AC8
-.global lbl_80517150
-lbl_80517150:
-	.4byte 0xBFE6066C
-	.4byte 0x1B8D0159
-.global lbl_80517158
-lbl_80517158:
-	.4byte 0x3FB3B8C5
-	.4byte 0xB12E9282
-.global lbl_80517160
-lbl_80517160:
-	.4byte 0x3FE00000
-	.4byte 0x00000000
-.global lbl_80517168
-lbl_80517168:
-	.4byte 0x40000000
-	.4byte 0x00000000
-.global lbl_80517170
-lbl_80517170:
-	.4byte 0x3FE921FB
-	.4byte 0x54442D18
-
-.section .text, "ax"  # 0x800056C0 - 0x80472F00
-.global __ieee754_asin
-__ieee754_asin:
-/* 800CC3B0 000C92F0  94 21 FF B0 */	stwu r1, -0x50(r1)
-/* 800CC3B4 000C92F4  7C 08 02 A6 */	mflr r0
-/* 800CC3B8 000C92F8  90 01 00 54 */	stw r0, 0x54(r1)
-/* 800CC3BC 000C92FC  DB E1 00 40 */	stfd f31, 0x40(r1)
-/* 800CC3C0 000C9300  F3 E1 00 48 */	psq_st f31, 72(r1), 0, qr0
-/* 800CC3C4 000C9304  DB C1 00 30 */	stfd f30, 0x30(r1)
-/* 800CC3C8 000C9308  F3 C1 00 38 */	psq_st f30, 56(r1), 0, qr0
-/* 800CC3CC 000C930C  DB A1 00 20 */	stfd f29, 0x20(r1)
-/* 800CC3D0 000C9310  F3 A1 00 28 */	psq_st f29, 40(r1), 0, qr0
-/* 800CC3D4 000C9314  93 E1 00 1C */	stw r31, 0x1c(r1)
-/* 800CC3D8 000C9318  93 C1 00 18 */	stw r30, 0x18(r1)
-/* 800CC3DC 000C931C  D8 21 00 08 */	stfd f1, 8(r1)
-/* 800CC3E0 000C9320  3C 00 3F F0 */	lis r0, 0x3ff0
-/* 800CC3E4 000C9324  83 E1 00 08 */	lwz r31, 8(r1)
-/* 800CC3E8 000C9328  57 FE 00 7E */	clrlwi r30, r31, 1
-/* 800CC3EC 000C932C  7C 1E 00 00 */	cmpw r30, r0
-/* 800CC3F0 000C9330  41 80 00 34 */	blt lbl_800CC424
-/* 800CC3F4 000C9334  80 01 00 0C */	lwz r0, 0xc(r1)
-/* 800CC3F8 000C9338  3C 7E C0 10 */	addis r3, r30, 0xc010
-/* 800CC3FC 000C933C  7C 60 03 79 */	or. r0, r3, r0
-/* 800CC400 000C9340  40 82 00 18 */	bne lbl_800CC418
-/* 800CC404 000C9344  C8 02 8D 98 */	lfd f0, lbl_805170F8@sda21(r2)
-/* 800CC408 000C9348  C8 42 8D 90 */	lfd f2, lbl_805170F0@sda21(r2)
-/* 800CC40C 000C934C  FC 00 00 72 */	fmul f0, f0, f1
-/* 800CC410 000C9350  FC 22 00 7A */	fmadd f1, f2, f1, f0
-/* 800CC414 000C9354  48 00 01 A4 */	b lbl_800CC5B8
-lbl_800CC418:
-/* 800CC418 000C9358  3C 60 80 51 */	lis r3, __float_nan@ha
-/* 800CC41C 000C935C  C0 23 48 B0 */	lfs f1, __float_nan@l(r3)
-/* 800CC420 000C9360  48 00 01 98 */	b lbl_800CC5B8
-lbl_800CC424:
-/* 800CC424 000C9364  3C 00 3F E0 */	lis r0, 0x3fe0
-/* 800CC428 000C9368  7C 1E 00 00 */	cmpw r30, r0
-/* 800CC42C 000C936C  40 80 00 94 */	bge lbl_800CC4C0
-/* 800CC430 000C9370  3C 00 3E 40 */	lis r0, 0x3e40
-/* 800CC434 000C9374  7C 1E 00 00 */	cmpw r30, r0
-/* 800CC438 000C9378  40 80 00 1C */	bge lbl_800CC454
-/* 800CC43C 000C937C  C8 42 8D A0 */	lfd f2, lbl_80517100@sda21(r2)
-/* 800CC440 000C9380  C8 02 8D A8 */	lfd f0, lbl_80517108@sda21(r2)
-/* 800CC444 000C9384  FC 42 08 2A */	fadd f2, f2, f1
-/* 800CC448 000C9388  FC 02 00 40 */	fcmpo cr0, f2, f0
-/* 800CC44C 000C938C  40 81 00 0C */	ble lbl_800CC458
-/* 800CC450 000C9390  48 00 01 68 */	b lbl_800CC5B8
-lbl_800CC454:
-/* 800CC454 000C9394  FF E1 00 72 */	fmul f31, f1, f1
-lbl_800CC458:
-/* 800CC458 000C9398  C8 22 8D D8 */	lfd f1, lbl_80517138@sda21(r2)
-/* 800CC45C 000C939C  C8 02 8D D0 */	lfd f0, lbl_80517130@sda21(r2)
-/* 800CC460 000C93A0  C8 42 8D C8 */	lfd f2, lbl_80517128@sda21(r2)
-/* 800CC464 000C93A4  FC 61 07 FA */	fmadd f3, f1, f31, f0
-/* 800CC468 000C93A8  C8 C2 8D C0 */	lfd f6, lbl_80517120@sda21(r2)
-/* 800CC46C 000C93AC  C8 22 8D F8 */	lfd f1, lbl_80517158@sda21(r2)
-/* 800CC470 000C93B0  C8 02 8D F0 */	lfd f0, lbl_80517150@sda21(r2)
-/* 800CC474 000C93B4  C8 A2 8D B8 */	lfd f5, lbl_80517118@sda21(r2)
-/* 800CC478 000C93B8  FC FF 10 FA */	fmadd f7, f31, f3, f2
-/* 800CC47C 000C93BC  C8 42 8D E8 */	lfd f2, lbl_80517148@sda21(r2)
-/* 800CC480 000C93C0  FC 61 07 FA */	fmadd f3, f1, f31, f0
-/* 800CC484 000C93C4  C8 82 8D B0 */	lfd f4, lbl_80517110@sda21(r2)
-/* 800CC488 000C93C8  C8 22 8D E0 */	lfd f1, lbl_80517140@sda21(r2)
-/* 800CC48C 000C93CC  FC DF 31 FA */	fmadd f6, f31, f7, f6
-/* 800CC490 000C93D0  C8 02 8D A8 */	lfd f0, lbl_80517108@sda21(r2)
-/* 800CC494 000C93D4  FC 5F 10 FA */	fmadd f2, f31, f3, f2
-/* 800CC498 000C93D8  C8 E1 00 08 */	lfd f7, 8(r1)
-/* 800CC49C 000C93DC  FC 7F 29 BA */	fmadd f3, f31, f6, f5
-/* 800CC4A0 000C93E0  FC 3F 08 BA */	fmadd f1, f31, f2, f1
-/* 800CC4A4 000C93E4  FC 5F 20 FA */	fmadd f2, f31, f3, f4
-/* 800CC4A8 000C93E8  FC 1F 00 7A */	fmadd f0, f31, f1, f0
-/* 800CC4AC 000C93EC  FC 3F 00 B2 */	fmul f1, f31, f2
-/* 800CC4B0 000C93F0  FC 01 00 24 */	fdiv f0, f1, f0
-/* 800CC4B4 000C93F4  FC 27 38 3A */	fmadd f1, f7, f0, f7
-/* 800CC4B8 000C93F8  D8 01 00 10 */	stfd f0, 0x10(r1)
-/* 800CC4BC 000C93FC  48 00 00 FC */	b lbl_800CC5B8
-lbl_800CC4C0:
-/* 800CC4C0 000C9400  FC 20 0A 10 */	fabs f1, f1
-/* 800CC4C4 000C9404  C9 22 8D A8 */	lfd f9, lbl_80517108@sda21(r2)
-/* 800CC4C8 000C9408  C8 02 8E 00 */	lfd f0, lbl_80517160@sda21(r2)
-/* 800CC4CC 000C940C  C8 E2 8D D8 */	lfd f7, lbl_80517138@sda21(r2)
-/* 800CC4D0 000C9410  FD 09 08 28 */	fsub f8, f9, f1
-/* 800CC4D4 000C9414  C8 62 8D D0 */	lfd f3, lbl_80517130@sda21(r2)
-/* 800CC4D8 000C9418  C8 C2 8D C8 */	lfd f6, lbl_80517128@sda21(r2)
-/* 800CC4DC 000C941C  C8 A2 8D C0 */	lfd f5, lbl_80517120@sda21(r2)
-/* 800CC4E0 000C9420  FF E0 02 32 */	fmul f31, f0, f8
-/* 800CC4E4 000C9424  C8 42 8D F8 */	lfd f2, lbl_80517158@sda21(r2)
-/* 800CC4E8 000C9428  C8 02 8D F0 */	lfd f0, lbl_80517150@sda21(r2)
-/* 800CC4EC 000C942C  C8 82 8D B8 */	lfd f4, lbl_80517118@sda21(r2)
-/* 800CC4F0 000C9430  C8 22 8D E8 */	lfd f1, lbl_80517148@sda21(r2)
-/* 800CC4F4 000C9434  FC E7 1F FA */	fmadd f7, f7, f31, f3
-/* 800CC4F8 000C9438  C8 62 8D B0 */	lfd f3, lbl_80517110@sda21(r2)
-/* 800CC4FC 000C943C  FC 42 07 FA */	fmadd f2, f2, f31, f0
-/* 800CC500 000C9440  C8 02 8D E0 */	lfd f0, lbl_80517140@sda21(r2)
-/* 800CC504 000C9444  D9 01 00 10 */	stfd f8, 0x10(r1)
-/* 800CC508 000C9448  FC DF 31 FA */	fmadd f6, f31, f7, f6
-/* 800CC50C 000C944C  FC 3F 08 BA */	fmadd f1, f31, f2, f1
-/* 800CC510 000C9450  FC 5F 29 BA */	fmadd f2, f31, f6, f5
-/* 800CC514 000C9454  FC 1F 00 7A */	fmadd f0, f31, f1, f0
-/* 800CC518 000C9458  FC 3F 20 BA */	fmadd f1, f31, f2, f4
-/* 800CC51C 000C945C  FF BF 48 3A */	fmadd f29, f31, f0, f9
-/* 800CC520 000C9460  FC 1F 18 7A */	fmadd f0, f31, f1, f3
-/* 800CC524 000C9464  FC 20 F8 90 */	fmr f1, f31
-/* 800CC528 000C9468  FF DF 00 32 */	fmul f30, f31, f0
-/* 800CC52C 000C946C  48 00 37 91 */	bl sqrt
-/* 800CC530 000C9470  3C 60 3F EF */	lis r3, 0x3FEF3333@ha
-/* 800CC534 000C9474  38 03 33 33 */	addi r0, r3, 0x3FEF3333@l
-/* 800CC538 000C9478  7C 1E 00 00 */	cmpw r30, r0
-/* 800CC53C 000C947C  41 80 00 28 */	blt lbl_800CC564
-/* 800CC540 000C9480  FC 9E E8 24 */	fdiv f4, f30, f29
-/* 800CC544 000C9484  C8 42 8E 08 */	lfd f2, lbl_80517168@sda21(r2)
-/* 800CC548 000C9488  C8 02 8D 98 */	lfd f0, lbl_805170F8@sda21(r2)
-/* 800CC54C 000C948C  C8 62 8D 90 */	lfd f3, lbl_805170F0@sda21(r2)
-/* 800CC550 000C9490  FC 21 09 3A */	fmadd f1, f1, f4, f1
-/* 800CC554 000C9494  D8 81 00 10 */	stfd f4, 0x10(r1)
-/* 800CC558 000C9498  FC 02 00 78 */	fmsub f0, f2, f1, f0
-/* 800CC55C 000C949C  FC 23 00 28 */	fsub f1, f3, f0
-/* 800CC560 000C94A0  48 00 00 48 */	b lbl_800CC5A8
-lbl_800CC564:
-/* 800CC564 000C94A4  D8 21 00 10 */	stfd f1, 0x10(r1)
-/* 800CC568 000C94A8  38 00 00 00 */	li r0, 0
-/* 800CC56C 000C94AC  C8 E2 8E 08 */	lfd f7, lbl_80517168@sda21(r2)
-/* 800CC570 000C94B0  FC BE E8 24 */	fdiv f5, f30, f29
-/* 800CC574 000C94B4  90 01 00 14 */	stw r0, 0x14(r1)
-/* 800CC578 000C94B8  C8 02 8D 98 */	lfd f0, lbl_805170F8@sda21(r2)
-/* 800CC57C 000C94BC  C9 01 00 10 */	lfd f8, 0x10(r1)
-/* 800CC580 000C94C0  C8 42 8E 10 */	lfd f2, lbl_80517170@sda21(r2)
-/* 800CC584 000C94C4  FC 88 FA 3C */	fnmsub f4, f8, f8, f31
-/* 800CC588 000C94C8  FC 61 40 2A */	fadd f3, f1, f8
-/* 800CC58C 000C94CC  FC C7 00 72 */	fmul f6, f7, f1
-/* 800CC590 000C94D0  FC 24 18 24 */	fdiv f1, f4, f3
-/* 800CC594 000C94D4  FC 27 00 7C */	fnmsub f1, f7, f1, f0
-/* 800CC598 000C94D8  FC 07 12 3C */	fnmsub f0, f7, f8, f2
-/* 800CC59C 000C94DC  FC 26 09 78 */	fmsub f1, f6, f5, f1
-/* 800CC5A0 000C94E0  FC 01 00 28 */	fsub f0, f1, f0
-/* 800CC5A4 000C94E4  FC 22 00 28 */	fsub f1, f2, f0
-lbl_800CC5A8:
-/* 800CC5A8 000C94E8  2C 1F 00 00 */	cmpwi r31, 0
-/* 800CC5AC 000C94EC  40 81 00 08 */	ble lbl_800CC5B4
-/* 800CC5B0 000C94F0  48 00 00 08 */	b lbl_800CC5B8
-lbl_800CC5B4:
-/* 800CC5B4 000C94F4  FC 20 08 50 */	fneg f1, f1
-lbl_800CC5B8:
-/* 800CC5B8 000C94F8  E3 E1 00 48 */	psq_l f31, 72(r1), 0, qr0
-/* 800CC5BC 000C94FC  CB E1 00 40 */	lfd f31, 0x40(r1)
-/* 800CC5C0 000C9500  E3 C1 00 38 */	psq_l f30, 56(r1), 0, qr0
-/* 800CC5C4 000C9504  CB C1 00 30 */	lfd f30, 0x30(r1)
-/* 800CC5C8 000C9508  E3 A1 00 28 */	psq_l f29, 40(r1), 0, qr0
-/* 800CC5CC 000C950C  CB A1 00 20 */	lfd f29, 0x20(r1)
-/* 800CC5D0 000C9510  83 E1 00 1C */	lwz r31, 0x1c(r1)
-/* 800CC5D4 000C9514  80 01 00 54 */	lwz r0, 0x54(r1)
-/* 800CC5D8 000C9518  83 C1 00 18 */	lwz r30, 0x18(r1)
-/* 800CC5DC 000C951C  7C 08 03 A6 */	mtlr r0
-/* 800CC5E0 000C9520  38 21 00 50 */	addi r1, r1, 0x50
-/* 800CC5E4 000C9524  4E 80 00 20 */	blr 
--- a/asm/Dolphin/e_sqrt.s
+++ b/asm/Dolphin/e_sqrt.s
@ -1,170 +0,0 @@
-.include "macros.inc"
-.section .sdata2, "a"     # 0x80516360 - 0x80520E40
-.balign 8
-.global lbl_805175E8
-lbl_805175E8:
-	.4byte 0x3FF00000
-	.4byte 0x00000000
-
-.section .text, "ax"  # 0x800056C0 - 0x80472F00
-.global __ieee754_sqrt
-__ieee754_sqrt:
-/* 800CFA2C 000CC96C  94 21 FF E0 */	stwu r1, -0x20(r1)
-/* 800CFA30 000CC970  D8 21 00 08 */	stfd f1, 8(r1)
-/* 800CFA34 000CC974  80 C1 00 08 */	lwz r6, 8(r1)
-/* 800CFA38 000CC978  80 01 00 0C */	lwz r0, 0xc(r1)
-/* 800CFA3C 000CC97C  54 C3 00 56 */	rlwinm r3, r6, 0, 1, 0xb
-/* 800CFA40 000CC980  3C 63 80 10 */	addis r3, r3, 0x8010
-/* 800CFA44 000CC984  28 03 00 00 */	cmplwi r3, 0
-/* 800CFA48 000CC988  40 82 00 14 */	bne lbl_800CFA5C
-/* 800CFA4C 000CC98C  FC 21 08 7A */	fmadd f1, f1, f1, f1
-/* 800CFA50 000CC990  38 00 00 21 */	li r0, 0x21
-/* 800CFA54 000CC994  90 0D 8C C0 */	stw r0, errno@sda21(r13)
-/* 800CFA58 000CC998  48 00 01 F0 */	b lbl_800CFC48
-lbl_800CFA5C:
-/* 800CFA5C 000CC99C  2C 06 00 00 */	cmpwi r6, 0
-/* 800CFA60 000CC9A0  41 81 00 30 */	bgt lbl_800CFA90
-/* 800CFA64 000CC9A4  54 C3 00 7E */	clrlwi r3, r6, 1
-/* 800CFA68 000CC9A8  7C 03 1B 79 */	or. r3, r0, r3
-/* 800CFA6C 000CC9AC  40 82 00 08 */	bne lbl_800CFA74
-/* 800CFA70 000CC9B0  48 00 01 D8 */	b lbl_800CFC48
-lbl_800CFA74:
-/* 800CFA74 000CC9B4  2C 06 00 00 */	cmpwi r6, 0
-/* 800CFA78 000CC9B8  40 80 00 18 */	bge lbl_800CFA90
-/* 800CFA7C 000CC9BC  3C 60 80 51 */	lis r3, __float_nan@ha
-/* 800CFA80 000CC9C0  38 00 00 21 */	li r0, 0x21
-/* 800CFA84 000CC9C4  90 0D 8C C0 */	stw r0, errno@sda21(r13)
-/* 800CFA88 000CC9C8  C0 23 48 B0 */	lfs f1, __float_nan@l(r3)
-/* 800CFA8C 000CC9CC  48 00 01 BC */	b lbl_800CFC48
-lbl_800CFA90:
-/* 800CFA90 000CC9D0  7C C3 A6 71 */	srawi. r3, r6, 0x14
-/* 800CFA94 000CC9D4  40 82 00 50 */	bne lbl_800CFAE4
-/* 800CFA98 000CC9D8  48 00 00 14 */	b lbl_800CFAAC
-lbl_800CFA9C:
-/* 800CFA9C 000CC9DC  54 04 AA FE */	srwi r4, r0, 0xb
-/* 800CFAA0 000CC9E0  54 00 A8 14 */	slwi r0, r0, 0x15
-/* 800CFAA4 000CC9E4  7C C6 23 78 */	or r6, r6, r4
-/* 800CFAA8 000CC9E8  38 63 FF EB */	addi r3, r3, -21
-lbl_800CFAAC:
-/* 800CFAAC 000CC9EC  2C 06 00 00 */	cmpwi r6, 0
-/* 800CFAB0 000CC9F0  41 82 FF EC */	beq lbl_800CFA9C
-/* 800CFAB4 000CC9F4  38 E0 00 00 */	li r7, 0
-/* 800CFAB8 000CC9F8  48 00 00 0C */	b lbl_800CFAC4
-lbl_800CFABC:
-/* 800CFABC 000CC9FC  54 C6 08 3C */	slwi r6, r6, 1
-/* 800CFAC0 000CCA00  38 E7 00 01 */	addi r7, r7, 1
-lbl_800CFAC4:
-/* 800CFAC4 000CCA04  54 C4 02 D7 */	rlwinm. r4, r6, 0, 0xb, 0xb
-/* 800CFAC8 000CCA08  41 82 FF F4 */	beq lbl_800CFABC
-/* 800CFACC 000CCA0C  20 87 00 20 */	subfic r4, r7, 0x20
-/* 800CFAD0 000CCA10  38 A7 FF FF */	addi r5, r7, -1
-/* 800CFAD4 000CCA14  7C 04 24 30 */	srw r4, r0, r4
-/* 800CFAD8 000CCA18  7C 00 38 30 */	slw r0, r0, r7
-/* 800CFADC 000CCA1C  7C 65 18 50 */	subf r3, r5, r3
-/* 800CFAE0 000CCA20  7C C6 23 78 */	or r6, r6, r4
-lbl_800CFAE4:
-/* 800CFAE4 000CCA24  38 83 FC 01 */	addi r4, r3, -1023
-/* 800CFAE8 000CCA28  54 C5 03 3E */	clrlwi r5, r6, 0xc
-/* 800CFAEC 000CCA2C  54 84 07 FF */	clrlwi. r4, r4, 0x1f
-/* 800CFAF0 000CCA30  64 A5 00 10 */	oris r5, r5, 0x10
-/* 800CFAF4 000CCA34  41 82 00 14 */	beq lbl_800CFB08
-/* 800CFAF8 000CCA38  54 04 0F FE */	srwi r4, r0, 0x1f
-/* 800CFAFC 000CCA3C  7C 00 02 14 */	add r0, r0, r0
-/* 800CFB00 000CCA40  7C 84 2A 14 */	add r4, r4, r5
-/* 800CFB04 000CCA44  7C A5 22 14 */	add r5, r5, r4
-lbl_800CFB08:
-/* 800CFB08 000CCA48  54 04 0F FE */	srwi r4, r0, 0x1f
-/* 800CFB0C 000CCA4C  7C 00 02 14 */	add r0, r0, r0
-/* 800CFB10 000CCA50  7C 84 2A 14 */	add r4, r4, r5
-/* 800CFB14 000CCA54  39 20 00 00 */	li r9, 0
-/* 800CFB18 000CCA58  7C A5 22 14 */	add r5, r5, r4
-/* 800CFB1C 000CCA5C  39 60 00 00 */	li r11, 0
-/* 800CFB20 000CCA60  39 40 00 00 */	li r10, 0
-/* 800CFB24 000CCA64  39 80 00 00 */	li r12, 0
-/* 800CFB28 000CCA68  3C C0 00 20 */	lis r6, 0x20
-/* 800CFB2C 000CCA6C  48 00 00 30 */	b lbl_800CFB5C
-lbl_800CFB30:
-/* 800CFB30 000CCA70  7C 8B 32 14 */	add r4, r11, r6
-/* 800CFB34 000CCA74  7C 04 28 00 */	cmpw r4, r5
-/* 800CFB38 000CCA78  41 81 00 10 */	bgt lbl_800CFB48
-/* 800CFB3C 000CCA7C  7D 64 32 14 */	add r11, r4, r6
-/* 800CFB40 000CCA80  7C A4 28 50 */	subf r5, r4, r5
-/* 800CFB44 000CCA84  7D 8C 32 14 */	add r12, r12, r6
-lbl_800CFB48:
-/* 800CFB48 000CCA88  54 04 0F FE */	srwi r4, r0, 0x1f
-/* 800CFB4C 000CCA8C  7C 00 02 14 */	add r0, r0, r0
-/* 800CFB50 000CCA90  7C 84 2A 14 */	add r4, r4, r5
-/* 800CFB54 000CCA94  54 C6 F8 7E */	srwi r6, r6, 1
-/* 800CFB58 000CCA98  7C A5 22 14 */	add r5, r5, r4
-lbl_800CFB5C:
-/* 800CFB5C 000CCA9C  28 06 00 00 */	cmplwi r6, 0
-/* 800CFB60 000CCAA0  40 82 FF D0 */	bne lbl_800CFB30
-/* 800CFB64 000CCAA4  3C C0 80 00 */	lis r6, 0x8000
-/* 800CFB68 000CCAA8  48 00 00 6C */	b lbl_800CFBD4
-lbl_800CFB6C:
-/* 800CFB6C 000CCAAC  7C 0B 28 00 */	cmpw r11, r5
-/* 800CFB70 000CCAB0  7D 67 5B 78 */	mr r7, r11
-/* 800CFB74 000CCAB4  7D 09 32 14 */	add r8, r9, r6
-/* 800CFB78 000CCAB8  41 80 00 10 */	blt lbl_800CFB88
-/* 800CFB7C 000CCABC  40 82 00 44 */	bne lbl_800CFBC0
-/* 800CFB80 000CCAC0  7C 08 00 40 */	cmplw r8, r0
-/* 800CFB84 000CCAC4  41 81 00 3C */	bgt lbl_800CFBC0
-lbl_800CFB88:
-/* 800CFB88 000CCAC8  55 04 00 00 */	rlwinm r4, r8, 0, 0, 0
-/* 800CFB8C 000CCACC  7D 28 32 14 */	add r9, r8, r6
-/* 800CFB90 000CCAD0  3C 84 80 00 */	addis r4, r4, 0x8000
-/* 800CFB94 000CCAD4  28 04 00 00 */	cmplwi r4, 0
-/* 800CFB98 000CCAD8  40 82 00 10 */	bne lbl_800CFBA8
-/* 800CFB9C 000CCADC  55 24 00 01 */	rlwinm. r4, r9, 0, 0, 0
-/* 800CFBA0 000CCAE0  40 82 00 08 */	bne lbl_800CFBA8
-/* 800CFBA4 000CCAE4  39 6B 00 01 */	addi r11, r11, 1
-lbl_800CFBA8:
-/* 800CFBA8 000CCAE8  7C 00 40 40 */	cmplw r0, r8
-/* 800CFBAC 000CCAEC  7C A7 28 50 */	subf r5, r7, r5
-/* 800CFBB0 000CCAF0  40 80 00 08 */	bge lbl_800CFBB8
-/* 800CFBB4 000CCAF4  38 A5 FF FF */	addi r5, r5, -1
-lbl_800CFBB8:
-/* 800CFBB8 000CCAF8  7C 08 00 50 */	subf r0, r8, r0
-/* 800CFBBC 000CCAFC  7D 4A 32 14 */	add r10, r10, r6
-lbl_800CFBC0:
-/* 800CFBC0 000CCB00  54 04 0F FE */	srwi r4, r0, 0x1f
-/* 800CFBC4 000CCB04  7C 00 02 14 */	add r0, r0, r0
-/* 800CFBC8 000CCB08  7C 84 2A 14 */	add r4, r4, r5
-/* 800CFBCC 000CCB0C  54 C6 F8 7E */	srwi r6, r6, 1
-/* 800CFBD0 000CCB10  7C A5 22 14 */	add r5, r5, r4
-lbl_800CFBD4:
-/* 800CFBD4 000CCB14  28 06 00 00 */	cmplwi r6, 0
-/* 800CFBD8 000CCB18  40 82 FF 94 */	bne lbl_800CFB6C
-/* 800CFBDC 000CCB1C  7C A0 03 79 */	or. r0, r5, r0
-/* 800CFBE0 000CCB20  41 82 00 30 */	beq lbl_800CFC10
-/* 800CFBE4 000CCB24  C8 02 92 88 */	lfd f0, lbl_805175E8@sda21(r2)
-/* 800CFBE8 000CCB28  3C 0A 00 01 */	addis r0, r10, 1
-/* 800CFBEC 000CCB2C  28 00 FF FF */	cmplwi r0, 0xffff
-/* 800CFBF0 000CCB30  D8 01 00 10 */	stfd f0, 0x10(r1)
-/* 800CFBF4 000CCB34  D8 01 00 10 */	stfd f0, 0x10(r1)
-/* 800CFBF8 000CCB38  40 82 00 10 */	bne lbl_800CFC08
-/* 800CFBFC 000CCB3C  39 40 00 00 */	li r10, 0
-/* 800CFC00 000CCB40  39 8C 00 01 */	addi r12, r12, 1
-/* 800CFC04 000CCB44  48 00 00 0C */	b lbl_800CFC10
-lbl_800CFC08:
-/* 800CFC08 000CCB48  55 40 07 FE */	clrlwi r0, r10, 0x1f
-/* 800CFC0C 000CCB4C  7D 4A 02 14 */	add r10, r10, r0
-lbl_800CFC10:
-/* 800CFC10 000CCB50  55 80 07 FE */	clrlwi r0, r12, 0x1f
-/* 800CFC14 000CCB54  7D 84 0E 70 */	srawi r4, r12, 1
-/* 800CFC18 000CCB58  2C 00 00 01 */	cmpwi r0, 1
-/* 800CFC1C 000CCB5C  55 45 F8 7E */	srwi r5, r10, 1
-/* 800CFC20 000CCB60  3C 84 3F E0 */	addis r4, r4, 0x3fe0
-/* 800CFC24 000CCB64  40 82 00 08 */	bne lbl_800CFC2C
-/* 800CFC28 000CCB68  64 A5 80 00 */	oris r5, r5, 0x8000
-lbl_800CFC2C:
-/* 800CFC2C 000CCB6C  38 03 FC 01 */	addi r0, r3, -1023
-/* 800CFC30 000CCB70  90 A1 00 14 */	stw r5, 0x14(r1)
-/* 800CFC34 000CCB74  7C 00 0E 70 */	srawi r0, r0, 1
-/* 800CFC38 000CCB78  54 00 A0 16 */	slwi r0, r0, 0x14
-/* 800CFC3C 000CCB7C  7C 84 02 14 */	add r4, r4, r0
-/* 800CFC40 000CCB80  90 81 00 10 */	stw r4, 0x10(r1)
-/* 800CFC44 000CCB84  C8 21 00 10 */	lfd f1, 0x10(r1)
-lbl_800CFC48:
-/* 800CFC48 000CCB88  38 21 00 20 */	addi r1, r1, 0x20
-/* 800CFC4C 000CCB8C  4E 80 00 20 */	blr 
--- a/obj_files.mk
+++ b/obj_files.mk
@ -306,7 +306,7 @@ DOLPHIN:=\
 	$(BUILD_DIR)/asm/Dolphin/strtoul.o\
 	$(BUILD_DIR)/src/Dolphin/wchar_io.o\
 	$(BUILD_DIR)/src/Dolphin/uart_console_io_gcn.o\
-	$(BUILD_DIR)/asm/Dolphin/e_asin.o\
+	$(BUILD_DIR)/src/Dolphin/e_asin.o\
 	$(BUILD_DIR)/src/Dolphin/e_atan2.o\
 	$(BUILD_DIR)/src/Dolphin/e_exp.o\
 	$(BUILD_DIR)/src/Dolphin/e_fmod.o\
@ -334,7 +334,7 @@ DOLPHIN:=\
 	$(BUILD_DIR)/src/Dolphin/w_fmod.o\
 	$(BUILD_DIR)/src/Dolphin/w_log10.o\
 	$(BUILD_DIR)/src/Dolphin/w_pow.o\
-	$(BUILD_DIR)/asm/Dolphin/e_sqrt.o\
+	$(BUILD_DIR)/src/Dolphin/e_sqrt.o\
 	$(BUILD_DIR)/src/Dolphin/math_ppc.o\
 	$(BUILD_DIR)/src/Dolphin/w_sqrt.o\
 	$(BUILD_DIR)/asm/Dolphin/extras.o\
--- a/src/Dolphin/e_asin.c
+++ b/src/Dolphin/e_asin.c
@ -42,6 +42,8 @@
 */

 #include "fdlibm.h"
+#include "Dolphin/float.h"
+#include "Dolphin/math.h"

 #ifdef __STDC__
 static const double
@ -79,7 +81,7 @@ double __ieee754_asin(x) double x;
 		if (((ix - 0x3ff00000) | __LO(x)) == 0)
 			/* asin(1)=+-pi/2 with inexact */
 			return x * pio2_hi + x * pio2_lo;
-		return (x - x) / (x - x); /* asin(|x|>1) is NaN */
+		return __float_nan;       /* asin(|x|>1) is NaN */
 	} else if (ix < 0x3fe00000) { /* |x|<0.5 */
 		if (ix < 0x3e400000) {    /* if |x| < 2**-27 */
 			if (huge + x > one)
@ -92,7 +94,7 @@ double __ieee754_asin(x) double x;
 		return x + x * w;
 	}
 	/* 1> |x|>= 0.5 */
-	w = one - __fabs(x);
+	w = one - fabs(x);
 	t = w * 0.5;
 	p = t * (pS0 + t * (pS1 + t * (pS2 + t * (pS3 + t * (pS4 + t * pS5)))));
 	q = one + t * (qS1 + t * (qS2 + t * (qS3 + t * qS4)));
--- a/src/Dolphin/e_sqrt.c
+++ b/src/Dolphin/e_sqrt.c
@ -1,194 +1,468 @@
-
-
 /*
 * --INFO--
 * Address:	800CFA2C
 * Size:	000224
 */
-void __ieee754_sqrt(void)
+
+/* @(#)e_sqrt.c 1.3 95/01/18 */
+/*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunSoft, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+
+/* __ieee754_sqrt(x)
+ * Return correctly rounded sqrt.
+ *           ------------------------------------------
+ *	     |  Use the hardware sqrt if you have one |
+ *           ------------------------------------------
+ * Method:
+ *   Bit by bit method using integer arithmetic. (Slow, but portable)
+ *   1. Normalization
+ *	Scale x to y in [1,4) with even powers of 2:
+ *	find an integer k such that  1 <= (y=x*2^(2k)) < 4, then
+ *		sqrt(x) = 2^k * sqrt(y)
+ *   2. Bit by bit computation
+ *	Let q  = sqrt(y) truncated to i bit after binary point (q = 1),
+ *	     i							 0
+ *                                     i+1         2
+ *	    s  = 2*q , and	y  =  2   * ( y - q  ).		(1)
+ *	     i      i            i                 i
+ *
+ *	To compute q    from q , one checks whether
+ *		    i+1       i
+ *
+ *			      -(i+1) 2
+ *			(q + 2      ) <= y.			(2)
+ *     			  i
+ *							      -(i+1)
+ *	If (2) is false, then q   = q ; otherwise q   = q  + 2      .
+ *		 	       i+1   i             i+1   i
+ *
+ *	With some algebric manipulation, it is not difficult to see
+ *	that (2) is equivalent to
+ *                             -(i+1)
+ *			s  +  2       <= y			(3)
+ *			 i                i
+ *
+ *	The advantage of (3) is that s  and y  can be computed by
+ *				      i      i
+ *	the following recurrence formula:
+ *	    if (3) is false
+ *
+ *	    s     =  s  ,	y    = y   ;			(4)
+ *	     i+1      i		 i+1    i
+ *
+ *	    otherwise,
+ *                         -i                     -(i+1)
+ *	    s	  =  s  + 2  ,  y    = y  -  s  - 2  		(5)
+ *           i+1      i          i+1    i     i
+ *
+ *	One may easily use induction to prove (4) and (5).
+ *	Note. Since the left hand side of (3) contain only i+2 bits,
+ *	      it does not necessary to do a full (53-bit) comparison
+ *	      in (3).
+ *   3. Final rounding
+ *	After generating the 53 bits result, we compute one more bit.
+ *	Together with the remainder, we can decide whether the
+ *	result is exact, bigger than 1/2ulp, or less than 1/2ulp
+ *	(it will never equal to 1/2ulp).
+ *	The rounding mode can be detected by checking whether
+ *	huge + tiny is equal to huge, and whether huge - tiny is
+ *	equal to huge for some floating point number "huge" and "tiny".
+ *
+ * Special cases:
+ *	sqrt(+-0) = +-0 	... exact
+ *	sqrt(inf) = inf
+ *	sqrt(-ve) = NaN		... with invalid signal
+ *	sqrt(NaN) = NaN		... with invalid signal for signaling NaN
+ *
+ * Other methods : see the appended file at the end of the program below.
+ *---------------
+ */
+
+#include "fdlibm.h"
+#include "errno.h"
+#include "Dolphin/float.h"
+
+#ifdef __STDC__
+static const double one = 1.0, tiny = 1.0e-300;
+#else
+static double one = 1.0, tiny = 1.0e-300;
+#endif
+
+#ifdef __STDC__
+double __ieee754_sqrt(double x)
+#else
+double __ieee754_sqrt(x) double x;
+#endif
 {
-	/*
-	.loc_0x0:
-	  stwu      r1, -0x20(r1)
-	  stfd      f1, 0x8(r1)
-	  lwz       r6, 0x8(r1)
-	  lwz       r0, 0xC(r1)
-	  rlwinm    r3,r6,0,1,11
-	  subis     r3, r3, 0x7FF0
-	  cmplwi    r3, 0
-	  bne-      .loc_0x30
-	  fmadd     f1, f1, f1, f1
-	  li        r0, 0x21
-	  stw       r0, -0x7340(r13)
-	  b         .loc_0x21C
+	double z;
+	int sign = (int)0x80000000;
+	unsigned r, t1, s1, ix1, q1;
+	int ix0, s0, q, m, t, i;

-	.loc_0x30:
-	  cmpwi     r6, 0
-	  bgt-      .loc_0x64
-	  rlwinm    r3,r6,0,1,31
-	  or.       r3, r0, r3
-	  bne-      .loc_0x48
-	  b         .loc_0x21C
+	ix0 = __HI(x); /* high word of x */
+	ix1 = __LO(x); /* low word of x */

-	.loc_0x48:
-	  cmpwi     r6, 0
-	  bge-      .loc_0x64
-	  lis       r3, 0x8051
-	  li        r0, 0x21
-	  stw       r0, -0x7340(r13)
-	  lfs       f1, 0x48B0(r3)
-	  b         .loc_0x21C
+	/* take care of Inf and NaN */
+	if ((ix0 & 0x7ff00000) == 0x7ff00000) {
+		errno = 33;
+		return x * x + x; /* sqrt(NaN)=NaN, sqrt(+inf)=+inf
+		             sqrt(-inf)=sNaN */
+	}
+	/* take care of zero */
+	if (ix0 <= 0) {
+		if (((ix0 & (~sign)) | ix1) == 0)
+			return x; /* sqrt(+-0) = +-0 */
+		else if (ix0 < 0) {
+			errno = 33;
+			return __float_nan;
+		} /* sqrt(-ve) = sNaN */
+	}
+	/* normalize x */
+	m = (ix0 >> 20);
+	if (m == 0) { /* subnormal x */
+		while (ix0 == 0) {
+			m -= 21;
+			ix0 |= (ix1 >> 11);
+			ix1 <<= 21;
+		}
+		for (i = 0; (ix0 & 0x00100000) == 0; i++)
+			ix0 <<= 1;
+		m -= i - 1;
+		ix0 |= (ix1 >> (32 - i));
+		ix1 <<= i;
+	}
+	m -= 1023; /* unbias exponent */
+	ix0 = (ix0 & 0x000fffff) | 0x00100000;
+	if (m & 1) { /* odd m, double x to make it even */
+		ix0 += ix0 + ((ix1 & sign) >> 31);
+		ix1 += ix1;
+	}
+	m >>= 1; /* m = [m/2] */

-	.loc_0x64:
-	  srawi.    r3, r6, 0x14
-	  bne-      .loc_0xB8
-	  b         .loc_0x80
+	/* generate sqrt(x) bit by bit */
+	ix0 += ix0 + ((ix1 & sign) >> 31);
+	ix1 += ix1;
+	q = q1 = s0 = s1 = 0;          /* [q,q1] = sqrt(x) */
+	r                = 0x00200000; /* r = moving bit from right to left */

-	.loc_0x70:
-	  rlwinm    r4,r0,21,11,31
-	  rlwinm    r0,r0,21,0,10
-	  or        r6, r6, r4
-	  subi      r3, r3, 0x15
+	while (r != 0) {
+		t = s0 + r;
+		if (t <= ix0) {
+			s0 = t + r;
+			ix0 -= t;
+			q += r;
+		}
+		ix0 += ix0 + ((ix1 & sign) >> 31);
+		ix1 += ix1;
+		r >>= 1;
+	}

-	.loc_0x80:
-	  cmpwi     r6, 0
-	  beq+      .loc_0x70
-	  li        r7, 0
-	  b         .loc_0x98
+	r = sign;
+	while (r != 0) {
+		t1 = s1 + r;
+		t  = s0;
+		if ((t < ix0) || ((t == ix0) && (t1 <= ix1))) {
+			s1 = t1 + r;
+			if (((t1 & sign) == sign) && (s1 & sign) == 0)
+				s0 += 1;
+			ix0 -= t;
+			if (ix1 < t1)
+				ix0 -= 1;
+			ix1 -= t1;
+			q1 += r;
+		}
+		ix0 += ix0 + ((ix1 & sign) >> 31);
+		ix1 += ix1;
+		r >>= 1;
+	}

-	.loc_0x90:
-	  rlwinm    r6,r6,1,0,30
-	  addi      r7, r7, 0x1
-
-	.loc_0x98:
-	  rlwinm.   r4,r6,0,11,11
-	  beq+      .loc_0x90
-	  subfic    r4, r7, 0x20
-	  subi      r5, r7, 0x1
-	  srw       r4, r0, r4
-	  slw       r0, r0, r7
-	  sub       r3, r3, r5
-	  or        r6, r6, r4
-
-	.loc_0xB8:
-	  subi      r4, r3, 0x3FF
-	  rlwinm    r5,r6,0,12,31
-	  rlwinm.   r4,r4,0,31,31
-	  oris      r5, r5, 0x10
-	  beq-      .loc_0xDC
-	  rlwinm    r4,r0,1,31,31
-	  add       r0, r0, r0
-	  add       r4, r4, r5
-	  add       r5, r5, r4
-
-	.loc_0xDC:
-	  rlwinm    r4,r0,1,31,31
-	  add       r0, r0, r0
-	  add       r4, r4, r5
-	  li        r9, 0
-	  add       r5, r5, r4
-	  li        r11, 0
-	  li        r10, 0
-	  li        r12, 0
-	  lis       r6, 0x20
-	  b         .loc_0x130
-
-	.loc_0x104:
-	  add       r4, r11, r6
-	  cmpw      r4, r5
-	  bgt-      .loc_0x11C
-	  add       r11, r4, r6
-	  sub       r5, r5, r4
-	  add       r12, r12, r6
-
-	.loc_0x11C:
-	  rlwinm    r4,r0,1,31,31
-	  add       r0, r0, r0
-	  add       r4, r4, r5
-	  rlwinm    r6,r6,31,1,31
-	  add       r5, r5, r4
-
-	.loc_0x130:
-	  cmplwi    r6, 0
-	  bne+      .loc_0x104
-	  lis       r6, 0x8000
-	  b         .loc_0x1A8
-
-	.loc_0x140:
-	  cmpw      r11, r5
-	  mr        r7, r11
-	  add       r8, r9, r6
-	  blt-      .loc_0x15C
-	  bne-      .loc_0x194
-	  cmplw     r8, r0
-	  bgt-      .loc_0x194
-
-	.loc_0x15C:
-	  rlwinm    r4,r8,0,0,0
-	  add       r9, r8, r6
-	  subis     r4, r4, 0x8000
-	  cmplwi    r4, 0
-	  bne-      .loc_0x17C
-	  rlwinm.   r4,r9,0,0,0
-	  bne-      .loc_0x17C
-	  addi      r11, r11, 0x1
-
-	.loc_0x17C:
-	  cmplw     r0, r8
-	  sub       r5, r5, r7
-	  bge-      .loc_0x18C
-	  subi      r5, r5, 0x1
-
-	.loc_0x18C:
-	  sub       r0, r0, r8
-	  add       r10, r10, r6
-
-	.loc_0x194:
-	  rlwinm    r4,r0,1,31,31
-	  add       r0, r0, r0
-	  add       r4, r4, r5
-	  rlwinm    r6,r6,31,1,31
-	  add       r5, r5, r4
-
-	.loc_0x1A8:
-	  cmplwi    r6, 0
-	  bne+      .loc_0x140
-	  or.       r0, r5, r0
-	  beq-      .loc_0x1E4
-	  lfd       f0, -0x6D78(r2)
-	  addis     r0, r10, 0x1
-	  cmplwi    r0, 0xFFFF
-	  stfd      f0, 0x10(r1)
-	  stfd      f0, 0x10(r1)
-	  bne-      .loc_0x1DC
-	  li        r10, 0
-	  addi      r12, r12, 0x1
-	  b         .loc_0x1E4
-
-	.loc_0x1DC:
-	  rlwinm    r0,r10,0,31,31
-	  add       r10, r10, r0
-
-	.loc_0x1E4:
-	  rlwinm    r0,r12,0,31,31
-	  srawi     r4, r12, 0x1
-	  cmpwi     r0, 0x1
-	  rlwinm    r5,r10,31,1,31
-	  addis     r4, r4, 0x3FE0
-	  bne-      .loc_0x200
-	  oris      r5, r5, 0x8000
-
-	.loc_0x200:
-	  subi      r0, r3, 0x3FF
-	  stw       r5, 0x14(r1)
-	  srawi     r0, r0, 0x1
-	  rlwinm    r0,r0,20,0,11
-	  add       r4, r4, r0
-	  stw       r4, 0x10(r1)
-	  lfd       f1, 0x10(r1)
-
-	.loc_0x21C:
-	  addi      r1, r1, 0x20
-	  blr
-	*/
+	/* use floating add to find out rounding direction */
+	if ((ix0 | ix1) != 0) {
+		z = one - tiny; /* trigger inexact flag */
+		if (z >= one) {
+			z = one + tiny;
+			if (q1 == (unsigned)0xffffffff) {
+				q1 = 0;
+				q += 1;
+			} else if (z > one) {
+				if (q1 == (unsigned)0xfffffffe)
+					q += 1;
+				q1 += 2;
+			} else
+				q1 += (q1 & 1);
+		}
+	}
+	ix0 = (q >> 1) + 0x3fe00000;
+	ix1 = q1 >> 1;
+	if ((q & 1) == 1)
+		ix1 |= sign;
+	ix0 += (m << 20);
+	__HI(z) = ix0;
+	__LO(z) = ix1;
+	return z;
 }
+
+/*
+Other methods  (use floating-point arithmetic)
+-------------
+(This is a copy of a drafted paper by Prof W. Kahan
+and K.C. Ng, written in May, 1986)
+
+    Two algorithms are given here to implement sqrt(x)
+    (IEEE double precision arithmetic) in software.
+    Both supply sqrt(x) correctly rounded. The first algorithm (in
+    Section A) uses newton iterations and involves four divisions.
+    The second one uses reciproot iterations to avoid division, but
+    requires more multiplications. Both algorithms need the ability
+    to chop results of arithmetic operations instead of round them,
+    and the INEXACT flag to indicate when an arithmetic operation
+    is executed exactly with no roundoff error, all part of the
+    standard (IEEE 754-1985). The ability to perform shift, add,
+    subtract and logical AND operations upon 32-bit words is needed
+    too, though not part of the standard.
+
+A.  sqrt(x) by Newton Iteration
+
+   (1)	Initial approximation
+
+    Let x0 and x1 be the leading and the trailing 32-bit words of
+    a floating point number x (in IEEE double format) respectively
+
+        1    11		     52				  ...widths
+       ------------------------------------------------------
+    x: |s|	  e     |	      f				|
+       ------------------------------------------------------
+          msb    lsb  msb				      lsb ...order
+
+
+         ------------------------  	     ------------------------
+    x0:  |s|   e    |    f1     |	 x1: |          f2           |
+         ------------------------  	     ------------------------
+
+    By performing shifts and subtracts on x0 and x1 (both regarded
+    as integers), we obtain an 8-bit approximation of sqrt(x) as
+    follows.
+
+        k  := (x0>>1) + 0x1ff80000;
+        y0 := k - T1[31&(k>>15)].	... y ~ sqrt(x) to 8 bits
+    Here k is a 32-bit integer and T1[] is an integer array containing
+    correction terms. Now magically the floating value of y (y's
+    leading 32-bit word is y0, the value of its trailing word is 0)
+    approximates sqrt(x) to almost 8-bit.
+
+    Value of T1:
+    static int T1[32]= {
+    0,	1024,	3062,	5746,	9193,	13348,	18162,	23592,
+    29598,	36145,	43202,	50740,	58733,	67158,	75992,	85215,
+    83599,	71378,	60428,	50647,	41945,	34246,	27478,	21581,
+    16499,	12183,	8588,	5674,	3403,	1742,	661,	130,};
+
+    (2)	Iterative refinement
+
+    Apply Heron's rule three times to y, we have y approximates
+    sqrt(x) to within 1 ulp (Unit in the Last Place):
+
+        y := (y+x/y)/2		... almost 17 sig. bits
+        y := (y+x/y)/2		... almost 35 sig. bits
+        y := y-(y-x/y)/2	... within 1 ulp
+
+
+    Remark 1.
+        Another way to improve y to within 1 ulp is:
+
+        y := (y+x/y)		... almost 17 sig. bits to 2*sqrt(x)
+        y := y - 0x00100006	... almost 18 sig. bits to sqrt(x)
+
+                2
+                (x-y )*y
+        y := y + 2* ----------	...within 1 ulp
+                   2
+                 3y  + x
+
+
+    This formula has one division fewer than the one above; however,
+    it requires more multiplications and additions. Also x must be
+    scaled in advance to avoid spurious overflow in evaluating the
+    expression 3y*y+x. Hence it is not recommended uless division
+    is slow. If division is very slow, then one should use the
+    reciproot algorithm given in section B.
+
+    (3) Final adjustment
+
+    By twiddling y's last bit it is possible to force y to be
+    correctly rounded according to the prevailing rounding mode
+    as follows. Let r and i be copies of the rounding mode and
+    inexact flag before entering the square root program. Also we
+    use the expression y+-ulp for the next representable floating
+    numbers (up and down) of y. Note that y+-ulp = either fixed
+    point y+-1, or multiply y by nextafter(1,+-inf) in chopped
+    mode.
+
+        I := FALSE;	... reset INEXACT flag I
+        R := RZ;	... set rounding mode to round-toward-zero
+        z := x/y;	... chopped quotient, possibly inexact
+        If(not I) then {	... if the quotient is exact
+            if(z=y) {
+                I := i;	 ... restore inexact flag
+                R := r;  ... restore rounded mode
+                return sqrt(x):=y.
+            } else {
+            z := z - ulp;	... special rounding
+            }
+        }
+        i := TRUE;		... sqrt(x) is inexact
+        If (r=RN) then z=z+ulp	... rounded-to-nearest
+        If (r=RP) then {	... round-toward-+inf
+            y = y+ulp; z=z+ulp;
+        }
+        y := y+z;		... chopped sum
+        y0:=y0-0x00100000;	... y := y/2 is correctly rounded.
+            I := i;	 		... restore inexact flag
+            R := r;  		... restore rounded mode
+            return sqrt(x):=y.
+
+    (4)	Special cases
+
+    Square root of +inf, +-0, or NaN is itself;
+    Square root of a negative number is NaN with invalid signal.
+
+
+B.  sqrt(x) by Reciproot Iteration
+
+   (1)	Initial approximation
+
+    Let x0 and x1 be the leading and the trailing 32-bit words of
+    a floating point number x (in IEEE double format) respectively
+    (see section A). By performing shifs and subtracts on x0 and y0,
+    we obtain a 7.8-bit approximation of 1/sqrt(x) as follows.
+
+        k := 0x5fe80000 - (x0>>1);
+        y0:= k - T2[63&(k>>14)].	... y ~ 1/sqrt(x) to 7.8 bits
+
+    Here k is a 32-bit integer and T2[] is an integer array
+    containing correction terms. Now magically the floating
+    value of y (y's leading 32-bit word is y0, the value of
+    its trailing word y1 is set to zero) approximates 1/sqrt(x)
+    to almost 7.8-bit.
+
+    Value of T2:
+    static int T2[64]= {
+    0x1500,	0x2ef8,	0x4d67,	0x6b02,	0x87be,	0xa395,	0xbe7a,	0xd866,
+    0xf14a,	0x1091b,0x11fcd,0x13552,0x14999,0x15c98,0x16e34,0x17e5f,
+    0x18d03,0x19a01,0x1a545,0x1ae8a,0x1b5c4,0x1bb01,0x1bfde,0x1c28d,
+    0x1c2de,0x1c0db,0x1ba73,0x1b11c,0x1a4b5,0x1953d,0x18266,0x16be0,
+    0x1683e,0x179d8,0x18a4d,0x19992,0x1a789,0x1b445,0x1bf61,0x1c989,
+    0x1d16d,0x1d77b,0x1dddf,0x1e2ad,0x1e5bf,0x1e6e8,0x1e654,0x1e3cd,
+    0x1df2a,0x1d635,0x1cb16,0x1be2c,0x1ae4e,0x19bde,0x1868e,0x16e2e,
+    0x1527f,0x1334a,0x11051,0xe951,	0xbe01,	0x8e0d,	0x5924,	0x1edd,};
+
+    (2)	Iterative refinement
+
+    Apply Reciproot iteration three times to y and multiply the
+    result by x to get an approximation z that matches sqrt(x)
+    to about 1 ulp. To be exact, we will have
+        -1ulp < sqrt(x)-z<1.0625ulp.
+
+    ... set rounding mode to Round-to-nearest
+       y := y*(1.5-0.5*x*y*y)	... almost 15 sig. bits to 1/sqrt(x)
+       y := y*((1.5-2^-30)+0.5*x*y*y)... about 29 sig. bits to 1/sqrt(x)
+    ... special arrangement for better accuracy
+       z := x*y			... 29 bits to sqrt(x), with z*y<1
+       z := z + 0.5*z*(1-z*y)	... about 1 ulp to sqrt(x)
+
+    Remark 2. The constant 1.5-2^-30 is chosen to bias the error so that
+    (a) the term z*y in the final iteration is always less than 1;
+    (b) the error in the final result is biased upward so that
+        -1 ulp < sqrt(x) - z < 1.0625 ulp
+        instead of |sqrt(x)-z|<1.03125ulp.
+
+    (3)	Final adjustment
+
+    By twiddling y's last bit it is possible to force y to be
+    correctly rounded according to the prevailing rounding mode
+    as follows. Let r and i be copies of the rounding mode and
+    inexact flag before entering the square root program. Also we
+    use the expression y+-ulp for the next representable floating
+    numbers (up and down) of y. Note that y+-ulp = either fixed
+    point y+-1, or multiply y by nextafter(1,+-inf) in chopped
+    mode.
+
+    R := RZ;		... set rounding mode to round-toward-zero
+    switch(r) {
+        case RN:		... round-to-nearest
+           if(x<= z*(z-ulp)...chopped) z = z - ulp; else
+           if(x<= z*(z+ulp)...chopped) z = z; else z = z+ulp;
+           break;
+        case RZ:case RM:	... round-to-zero or round-to--inf
+           R:=RP;		... reset rounding mod to round-to-+inf
+           if(x<z*z ... rounded up) z = z - ulp; else
+           if(x>=(z+ulp)*(z+ulp) ...rounded up) z = z+ulp;
+           break;
+        case RP:		... round-to-+inf
+           if(x>(z+ulp)*(z+ulp)...chopped) z = z+2*ulp; else
+           if(x>z*z ...chopped) z = z+ulp;
+           break;
+    }
+
+    Remark 3. The above comparisons can be done in fixed point. For
+    example, to compare x and w=z*z chopped, it suffices to compare
+    x1 and w1 (the trailing parts of x and w), regarding them as
+    two's complement integers.
+
+    ...Is z an exact square root?
+    To determine whether z is an exact square root of x, let z1 be the
+    trailing part of z, and also let x0 and x1 be the leading and
+    trailing parts of x.
+
+    If ((z1&0x03ffffff)!=0)	... not exact if trailing 26 bits of z!=0
+        I := 1;		... Raise Inexact flag: z is not exact
+    else {
+        j := 1 - [(x0>>20)&1]	... j = logb(x) mod 2
+        k := z1 >> 26;		... get z's 25-th and 26-th
+                        fraction bits
+        I := i or (k&j) or ((k&(j+j+1))!=(x1&3));
+    }
+    R:= r		... restore rounded mode
+    return sqrt(x):=z.
+
+    If multiplication is cheaper then the foregoing red tape, the
+    Inexact flag can be evaluated by
+
+        I := i;
+        I := (z*z!=x) or I.
+
+    Note that z*z can overwrite I; this value must be sensed if it is
+    True.
+
+    Remark 4. If z*z = x exactly, then bit 25 to bit 0 of z1 must be
+    zero.
+
+            --------------------
+        z1: |        f2        |
+            --------------------
+        bit 31		   bit 0
+
+    Further more, bit 27 and 26 of z1, bit 0 and 1 of x1, and the odd
+    or even of logb(x) have the following relations:
+
+    -------------------------------------------------
+    bit 27,26 of z1		bit 1,0 of x1	logb(x)
+    -------------------------------------------------
+    00			00		odd and even
+    01			01		even
+    10			10		odd
+    10			00		even
+    11			01		even
+    -------------------------------------------------
+
+    (4)	Special cases (see (4) of Section A).
+
+ */