finish matching fdlibm files

fdlibm complete!
This commit is contained in:
EpochFlame 2022-02-11 15:38:29 -05:00
parent 46774da475
commit 6b9f925f74
5 changed files with 461 additions and 581 deletions

View File

@ -1,226 +0,0 @@
.include "macros.inc"
.section .sdata2, "a" # 0x80516360 - 0x80520E40
.balign 8
.global lbl_805170F0
lbl_805170F0:
.4byte 0x3FF921FB
.4byte 0x54442D18
.global lbl_805170F8
lbl_805170F8:
.4byte 0x3C91A626
.4byte 0x33145C07
.global lbl_80517100
lbl_80517100:
.4byte 0x7E37E43C
.4byte 0x8800759C
.global lbl_80517108
lbl_80517108:
.4byte 0x3FF00000
.4byte 0x00000000
.global lbl_80517110
lbl_80517110:
.4byte 0x3FC55555
.4byte 0x55555555
.global lbl_80517118
lbl_80517118:
.4byte 0xBFD4D612
.4byte 0x03EB6F7D
.global lbl_80517120
lbl_80517120:
.4byte 0x3FC9C155
.4byte 0x0E884455
.global lbl_80517128
lbl_80517128:
.4byte 0xBFA48228
.4byte 0xB5688F3B
.global lbl_80517130
lbl_80517130:
.4byte 0x3F49EFE0
.4byte 0x7501B288
.global lbl_80517138
lbl_80517138:
.4byte 0x3F023DE1
.4byte 0x0DFDF709
.global lbl_80517140
lbl_80517140:
.4byte 0xC0033A27
.4byte 0x1C8A2D4B
.global lbl_80517148
lbl_80517148:
.4byte 0x40002AE5
.4byte 0x9C598AC8
.global lbl_80517150
lbl_80517150:
.4byte 0xBFE6066C
.4byte 0x1B8D0159
.global lbl_80517158
lbl_80517158:
.4byte 0x3FB3B8C5
.4byte 0xB12E9282
.global lbl_80517160
lbl_80517160:
.4byte 0x3FE00000
.4byte 0x00000000
.global lbl_80517168
lbl_80517168:
.4byte 0x40000000
.4byte 0x00000000
.global lbl_80517170
lbl_80517170:
.4byte 0x3FE921FB
.4byte 0x54442D18
.section .text, "ax" # 0x800056C0 - 0x80472F00
.global __ieee754_asin
__ieee754_asin:
/* 800CC3B0 000C92F0 94 21 FF B0 */ stwu r1, -0x50(r1)
/* 800CC3B4 000C92F4 7C 08 02 A6 */ mflr r0
/* 800CC3B8 000C92F8 90 01 00 54 */ stw r0, 0x54(r1)
/* 800CC3BC 000C92FC DB E1 00 40 */ stfd f31, 0x40(r1)
/* 800CC3C0 000C9300 F3 E1 00 48 */ psq_st f31, 72(r1), 0, qr0
/* 800CC3C4 000C9304 DB C1 00 30 */ stfd f30, 0x30(r1)
/* 800CC3C8 000C9308 F3 C1 00 38 */ psq_st f30, 56(r1), 0, qr0
/* 800CC3CC 000C930C DB A1 00 20 */ stfd f29, 0x20(r1)
/* 800CC3D0 000C9310 F3 A1 00 28 */ psq_st f29, 40(r1), 0, qr0
/* 800CC3D4 000C9314 93 E1 00 1C */ stw r31, 0x1c(r1)
/* 800CC3D8 000C9318 93 C1 00 18 */ stw r30, 0x18(r1)
/* 800CC3DC 000C931C D8 21 00 08 */ stfd f1, 8(r1)
/* 800CC3E0 000C9320 3C 00 3F F0 */ lis r0, 0x3ff0
/* 800CC3E4 000C9324 83 E1 00 08 */ lwz r31, 8(r1)
/* 800CC3E8 000C9328 57 FE 00 7E */ clrlwi r30, r31, 1
/* 800CC3EC 000C932C 7C 1E 00 00 */ cmpw r30, r0
/* 800CC3F0 000C9330 41 80 00 34 */ blt lbl_800CC424
/* 800CC3F4 000C9334 80 01 00 0C */ lwz r0, 0xc(r1)
/* 800CC3F8 000C9338 3C 7E C0 10 */ addis r3, r30, 0xc010
/* 800CC3FC 000C933C 7C 60 03 79 */ or. r0, r3, r0
/* 800CC400 000C9340 40 82 00 18 */ bne lbl_800CC418
/* 800CC404 000C9344 C8 02 8D 98 */ lfd f0, lbl_805170F8@sda21(r2)
/* 800CC408 000C9348 C8 42 8D 90 */ lfd f2, lbl_805170F0@sda21(r2)
/* 800CC40C 000C934C FC 00 00 72 */ fmul f0, f0, f1
/* 800CC410 000C9350 FC 22 00 7A */ fmadd f1, f2, f1, f0
/* 800CC414 000C9354 48 00 01 A4 */ b lbl_800CC5B8
lbl_800CC418:
/* 800CC418 000C9358 3C 60 80 51 */ lis r3, __float_nan@ha
/* 800CC41C 000C935C C0 23 48 B0 */ lfs f1, __float_nan@l(r3)
/* 800CC420 000C9360 48 00 01 98 */ b lbl_800CC5B8
lbl_800CC424:
/* 800CC424 000C9364 3C 00 3F E0 */ lis r0, 0x3fe0
/* 800CC428 000C9368 7C 1E 00 00 */ cmpw r30, r0
/* 800CC42C 000C936C 40 80 00 94 */ bge lbl_800CC4C0
/* 800CC430 000C9370 3C 00 3E 40 */ lis r0, 0x3e40
/* 800CC434 000C9374 7C 1E 00 00 */ cmpw r30, r0
/* 800CC438 000C9378 40 80 00 1C */ bge lbl_800CC454
/* 800CC43C 000C937C C8 42 8D A0 */ lfd f2, lbl_80517100@sda21(r2)
/* 800CC440 000C9380 C8 02 8D A8 */ lfd f0, lbl_80517108@sda21(r2)
/* 800CC444 000C9384 FC 42 08 2A */ fadd f2, f2, f1
/* 800CC448 000C9388 FC 02 00 40 */ fcmpo cr0, f2, f0
/* 800CC44C 000C938C 40 81 00 0C */ ble lbl_800CC458
/* 800CC450 000C9390 48 00 01 68 */ b lbl_800CC5B8
lbl_800CC454:
/* 800CC454 000C9394 FF E1 00 72 */ fmul f31, f1, f1
lbl_800CC458:
/* 800CC458 000C9398 C8 22 8D D8 */ lfd f1, lbl_80517138@sda21(r2)
/* 800CC45C 000C939C C8 02 8D D0 */ lfd f0, lbl_80517130@sda21(r2)
/* 800CC460 000C93A0 C8 42 8D C8 */ lfd f2, lbl_80517128@sda21(r2)
/* 800CC464 000C93A4 FC 61 07 FA */ fmadd f3, f1, f31, f0
/* 800CC468 000C93A8 C8 C2 8D C0 */ lfd f6, lbl_80517120@sda21(r2)
/* 800CC46C 000C93AC C8 22 8D F8 */ lfd f1, lbl_80517158@sda21(r2)
/* 800CC470 000C93B0 C8 02 8D F0 */ lfd f0, lbl_80517150@sda21(r2)
/* 800CC474 000C93B4 C8 A2 8D B8 */ lfd f5, lbl_80517118@sda21(r2)
/* 800CC478 000C93B8 FC FF 10 FA */ fmadd f7, f31, f3, f2
/* 800CC47C 000C93BC C8 42 8D E8 */ lfd f2, lbl_80517148@sda21(r2)
/* 800CC480 000C93C0 FC 61 07 FA */ fmadd f3, f1, f31, f0
/* 800CC484 000C93C4 C8 82 8D B0 */ lfd f4, lbl_80517110@sda21(r2)
/* 800CC488 000C93C8 C8 22 8D E0 */ lfd f1, lbl_80517140@sda21(r2)
/* 800CC48C 000C93CC FC DF 31 FA */ fmadd f6, f31, f7, f6
/* 800CC490 000C93D0 C8 02 8D A8 */ lfd f0, lbl_80517108@sda21(r2)
/* 800CC494 000C93D4 FC 5F 10 FA */ fmadd f2, f31, f3, f2
/* 800CC498 000C93D8 C8 E1 00 08 */ lfd f7, 8(r1)
/* 800CC49C 000C93DC FC 7F 29 BA */ fmadd f3, f31, f6, f5
/* 800CC4A0 000C93E0 FC 3F 08 BA */ fmadd f1, f31, f2, f1
/* 800CC4A4 000C93E4 FC 5F 20 FA */ fmadd f2, f31, f3, f4
/* 800CC4A8 000C93E8 FC 1F 00 7A */ fmadd f0, f31, f1, f0
/* 800CC4AC 000C93EC FC 3F 00 B2 */ fmul f1, f31, f2
/* 800CC4B0 000C93F0 FC 01 00 24 */ fdiv f0, f1, f0
/* 800CC4B4 000C93F4 FC 27 38 3A */ fmadd f1, f7, f0, f7
/* 800CC4B8 000C93F8 D8 01 00 10 */ stfd f0, 0x10(r1)
/* 800CC4BC 000C93FC 48 00 00 FC */ b lbl_800CC5B8
lbl_800CC4C0:
/* 800CC4C0 000C9400 FC 20 0A 10 */ fabs f1, f1
/* 800CC4C4 000C9404 C9 22 8D A8 */ lfd f9, lbl_80517108@sda21(r2)
/* 800CC4C8 000C9408 C8 02 8E 00 */ lfd f0, lbl_80517160@sda21(r2)
/* 800CC4CC 000C940C C8 E2 8D D8 */ lfd f7, lbl_80517138@sda21(r2)
/* 800CC4D0 000C9410 FD 09 08 28 */ fsub f8, f9, f1
/* 800CC4D4 000C9414 C8 62 8D D0 */ lfd f3, lbl_80517130@sda21(r2)
/* 800CC4D8 000C9418 C8 C2 8D C8 */ lfd f6, lbl_80517128@sda21(r2)
/* 800CC4DC 000C941C C8 A2 8D C0 */ lfd f5, lbl_80517120@sda21(r2)
/* 800CC4E0 000C9420 FF E0 02 32 */ fmul f31, f0, f8
/* 800CC4E4 000C9424 C8 42 8D F8 */ lfd f2, lbl_80517158@sda21(r2)
/* 800CC4E8 000C9428 C8 02 8D F0 */ lfd f0, lbl_80517150@sda21(r2)
/* 800CC4EC 000C942C C8 82 8D B8 */ lfd f4, lbl_80517118@sda21(r2)
/* 800CC4F0 000C9430 C8 22 8D E8 */ lfd f1, lbl_80517148@sda21(r2)
/* 800CC4F4 000C9434 FC E7 1F FA */ fmadd f7, f7, f31, f3
/* 800CC4F8 000C9438 C8 62 8D B0 */ lfd f3, lbl_80517110@sda21(r2)
/* 800CC4FC 000C943C FC 42 07 FA */ fmadd f2, f2, f31, f0
/* 800CC500 000C9440 C8 02 8D E0 */ lfd f0, lbl_80517140@sda21(r2)
/* 800CC504 000C9444 D9 01 00 10 */ stfd f8, 0x10(r1)
/* 800CC508 000C9448 FC DF 31 FA */ fmadd f6, f31, f7, f6
/* 800CC50C 000C944C FC 3F 08 BA */ fmadd f1, f31, f2, f1
/* 800CC510 000C9450 FC 5F 29 BA */ fmadd f2, f31, f6, f5
/* 800CC514 000C9454 FC 1F 00 7A */ fmadd f0, f31, f1, f0
/* 800CC518 000C9458 FC 3F 20 BA */ fmadd f1, f31, f2, f4
/* 800CC51C 000C945C FF BF 48 3A */ fmadd f29, f31, f0, f9
/* 800CC520 000C9460 FC 1F 18 7A */ fmadd f0, f31, f1, f3
/* 800CC524 000C9464 FC 20 F8 90 */ fmr f1, f31
/* 800CC528 000C9468 FF DF 00 32 */ fmul f30, f31, f0
/* 800CC52C 000C946C 48 00 37 91 */ bl sqrt
/* 800CC530 000C9470 3C 60 3F EF */ lis r3, 0x3FEF3333@ha
/* 800CC534 000C9474 38 03 33 33 */ addi r0, r3, 0x3FEF3333@l
/* 800CC538 000C9478 7C 1E 00 00 */ cmpw r30, r0
/* 800CC53C 000C947C 41 80 00 28 */ blt lbl_800CC564
/* 800CC540 000C9480 FC 9E E8 24 */ fdiv f4, f30, f29
/* 800CC544 000C9484 C8 42 8E 08 */ lfd f2, lbl_80517168@sda21(r2)
/* 800CC548 000C9488 C8 02 8D 98 */ lfd f0, lbl_805170F8@sda21(r2)
/* 800CC54C 000C948C C8 62 8D 90 */ lfd f3, lbl_805170F0@sda21(r2)
/* 800CC550 000C9490 FC 21 09 3A */ fmadd f1, f1, f4, f1
/* 800CC554 000C9494 D8 81 00 10 */ stfd f4, 0x10(r1)
/* 800CC558 000C9498 FC 02 00 78 */ fmsub f0, f2, f1, f0
/* 800CC55C 000C949C FC 23 00 28 */ fsub f1, f3, f0
/* 800CC560 000C94A0 48 00 00 48 */ b lbl_800CC5A8
lbl_800CC564:
/* 800CC564 000C94A4 D8 21 00 10 */ stfd f1, 0x10(r1)
/* 800CC568 000C94A8 38 00 00 00 */ li r0, 0
/* 800CC56C 000C94AC C8 E2 8E 08 */ lfd f7, lbl_80517168@sda21(r2)
/* 800CC570 000C94B0 FC BE E8 24 */ fdiv f5, f30, f29
/* 800CC574 000C94B4 90 01 00 14 */ stw r0, 0x14(r1)
/* 800CC578 000C94B8 C8 02 8D 98 */ lfd f0, lbl_805170F8@sda21(r2)
/* 800CC57C 000C94BC C9 01 00 10 */ lfd f8, 0x10(r1)
/* 800CC580 000C94C0 C8 42 8E 10 */ lfd f2, lbl_80517170@sda21(r2)
/* 800CC584 000C94C4 FC 88 FA 3C */ fnmsub f4, f8, f8, f31
/* 800CC588 000C94C8 FC 61 40 2A */ fadd f3, f1, f8
/* 800CC58C 000C94CC FC C7 00 72 */ fmul f6, f7, f1
/* 800CC590 000C94D0 FC 24 18 24 */ fdiv f1, f4, f3
/* 800CC594 000C94D4 FC 27 00 7C */ fnmsub f1, f7, f1, f0
/* 800CC598 000C94D8 FC 07 12 3C */ fnmsub f0, f7, f8, f2
/* 800CC59C 000C94DC FC 26 09 78 */ fmsub f1, f6, f5, f1
/* 800CC5A0 000C94E0 FC 01 00 28 */ fsub f0, f1, f0
/* 800CC5A4 000C94E4 FC 22 00 28 */ fsub f1, f2, f0
lbl_800CC5A8:
/* 800CC5A8 000C94E8 2C 1F 00 00 */ cmpwi r31, 0
/* 800CC5AC 000C94EC 40 81 00 08 */ ble lbl_800CC5B4
/* 800CC5B0 000C94F0 48 00 00 08 */ b lbl_800CC5B8
lbl_800CC5B4:
/* 800CC5B4 000C94F4 FC 20 08 50 */ fneg f1, f1
lbl_800CC5B8:
/* 800CC5B8 000C94F8 E3 E1 00 48 */ psq_l f31, 72(r1), 0, qr0
/* 800CC5BC 000C94FC CB E1 00 40 */ lfd f31, 0x40(r1)
/* 800CC5C0 000C9500 E3 C1 00 38 */ psq_l f30, 56(r1), 0, qr0
/* 800CC5C4 000C9504 CB C1 00 30 */ lfd f30, 0x30(r1)
/* 800CC5C8 000C9508 E3 A1 00 28 */ psq_l f29, 40(r1), 0, qr0
/* 800CC5CC 000C950C CB A1 00 20 */ lfd f29, 0x20(r1)
/* 800CC5D0 000C9510 83 E1 00 1C */ lwz r31, 0x1c(r1)
/* 800CC5D4 000C9514 80 01 00 54 */ lwz r0, 0x54(r1)
/* 800CC5D8 000C9518 83 C1 00 18 */ lwz r30, 0x18(r1)
/* 800CC5DC 000C951C 7C 08 03 A6 */ mtlr r0
/* 800CC5E0 000C9520 38 21 00 50 */ addi r1, r1, 0x50
/* 800CC5E4 000C9524 4E 80 00 20 */ blr

View File

@ -1,170 +0,0 @@
.include "macros.inc"
.section .sdata2, "a" # 0x80516360 - 0x80520E40
.balign 8
.global lbl_805175E8
lbl_805175E8:
.4byte 0x3FF00000
.4byte 0x00000000
.section .text, "ax" # 0x800056C0 - 0x80472F00
.global __ieee754_sqrt
__ieee754_sqrt:
/* 800CFA2C 000CC96C 94 21 FF E0 */ stwu r1, -0x20(r1)
/* 800CFA30 000CC970 D8 21 00 08 */ stfd f1, 8(r1)
/* 800CFA34 000CC974 80 C1 00 08 */ lwz r6, 8(r1)
/* 800CFA38 000CC978 80 01 00 0C */ lwz r0, 0xc(r1)
/* 800CFA3C 000CC97C 54 C3 00 56 */ rlwinm r3, r6, 0, 1, 0xb
/* 800CFA40 000CC980 3C 63 80 10 */ addis r3, r3, 0x8010
/* 800CFA44 000CC984 28 03 00 00 */ cmplwi r3, 0
/* 800CFA48 000CC988 40 82 00 14 */ bne lbl_800CFA5C
/* 800CFA4C 000CC98C FC 21 08 7A */ fmadd f1, f1, f1, f1
/* 800CFA50 000CC990 38 00 00 21 */ li r0, 0x21
/* 800CFA54 000CC994 90 0D 8C C0 */ stw r0, errno@sda21(r13)
/* 800CFA58 000CC998 48 00 01 F0 */ b lbl_800CFC48
lbl_800CFA5C:
/* 800CFA5C 000CC99C 2C 06 00 00 */ cmpwi r6, 0
/* 800CFA60 000CC9A0 41 81 00 30 */ bgt lbl_800CFA90
/* 800CFA64 000CC9A4 54 C3 00 7E */ clrlwi r3, r6, 1
/* 800CFA68 000CC9A8 7C 03 1B 79 */ or. r3, r0, r3
/* 800CFA6C 000CC9AC 40 82 00 08 */ bne lbl_800CFA74
/* 800CFA70 000CC9B0 48 00 01 D8 */ b lbl_800CFC48
lbl_800CFA74:
/* 800CFA74 000CC9B4 2C 06 00 00 */ cmpwi r6, 0
/* 800CFA78 000CC9B8 40 80 00 18 */ bge lbl_800CFA90
/* 800CFA7C 000CC9BC 3C 60 80 51 */ lis r3, __float_nan@ha
/* 800CFA80 000CC9C0 38 00 00 21 */ li r0, 0x21
/* 800CFA84 000CC9C4 90 0D 8C C0 */ stw r0, errno@sda21(r13)
/* 800CFA88 000CC9C8 C0 23 48 B0 */ lfs f1, __float_nan@l(r3)
/* 800CFA8C 000CC9CC 48 00 01 BC */ b lbl_800CFC48
lbl_800CFA90:
/* 800CFA90 000CC9D0 7C C3 A6 71 */ srawi. r3, r6, 0x14
/* 800CFA94 000CC9D4 40 82 00 50 */ bne lbl_800CFAE4
/* 800CFA98 000CC9D8 48 00 00 14 */ b lbl_800CFAAC
lbl_800CFA9C:
/* 800CFA9C 000CC9DC 54 04 AA FE */ srwi r4, r0, 0xb
/* 800CFAA0 000CC9E0 54 00 A8 14 */ slwi r0, r0, 0x15
/* 800CFAA4 000CC9E4 7C C6 23 78 */ or r6, r6, r4
/* 800CFAA8 000CC9E8 38 63 FF EB */ addi r3, r3, -21
lbl_800CFAAC:
/* 800CFAAC 000CC9EC 2C 06 00 00 */ cmpwi r6, 0
/* 800CFAB0 000CC9F0 41 82 FF EC */ beq lbl_800CFA9C
/* 800CFAB4 000CC9F4 38 E0 00 00 */ li r7, 0
/* 800CFAB8 000CC9F8 48 00 00 0C */ b lbl_800CFAC4
lbl_800CFABC:
/* 800CFABC 000CC9FC 54 C6 08 3C */ slwi r6, r6, 1
/* 800CFAC0 000CCA00 38 E7 00 01 */ addi r7, r7, 1
lbl_800CFAC4:
/* 800CFAC4 000CCA04 54 C4 02 D7 */ rlwinm. r4, r6, 0, 0xb, 0xb
/* 800CFAC8 000CCA08 41 82 FF F4 */ beq lbl_800CFABC
/* 800CFACC 000CCA0C 20 87 00 20 */ subfic r4, r7, 0x20
/* 800CFAD0 000CCA10 38 A7 FF FF */ addi r5, r7, -1
/* 800CFAD4 000CCA14 7C 04 24 30 */ srw r4, r0, r4
/* 800CFAD8 000CCA18 7C 00 38 30 */ slw r0, r0, r7
/* 800CFADC 000CCA1C 7C 65 18 50 */ subf r3, r5, r3
/* 800CFAE0 000CCA20 7C C6 23 78 */ or r6, r6, r4
lbl_800CFAE4:
/* 800CFAE4 000CCA24 38 83 FC 01 */ addi r4, r3, -1023
/* 800CFAE8 000CCA28 54 C5 03 3E */ clrlwi r5, r6, 0xc
/* 800CFAEC 000CCA2C 54 84 07 FF */ clrlwi. r4, r4, 0x1f
/* 800CFAF0 000CCA30 64 A5 00 10 */ oris r5, r5, 0x10
/* 800CFAF4 000CCA34 41 82 00 14 */ beq lbl_800CFB08
/* 800CFAF8 000CCA38 54 04 0F FE */ srwi r4, r0, 0x1f
/* 800CFAFC 000CCA3C 7C 00 02 14 */ add r0, r0, r0
/* 800CFB00 000CCA40 7C 84 2A 14 */ add r4, r4, r5
/* 800CFB04 000CCA44 7C A5 22 14 */ add r5, r5, r4
lbl_800CFB08:
/* 800CFB08 000CCA48 54 04 0F FE */ srwi r4, r0, 0x1f
/* 800CFB0C 000CCA4C 7C 00 02 14 */ add r0, r0, r0
/* 800CFB10 000CCA50 7C 84 2A 14 */ add r4, r4, r5
/* 800CFB14 000CCA54 39 20 00 00 */ li r9, 0
/* 800CFB18 000CCA58 7C A5 22 14 */ add r5, r5, r4
/* 800CFB1C 000CCA5C 39 60 00 00 */ li r11, 0
/* 800CFB20 000CCA60 39 40 00 00 */ li r10, 0
/* 800CFB24 000CCA64 39 80 00 00 */ li r12, 0
/* 800CFB28 000CCA68 3C C0 00 20 */ lis r6, 0x20
/* 800CFB2C 000CCA6C 48 00 00 30 */ b lbl_800CFB5C
lbl_800CFB30:
/* 800CFB30 000CCA70 7C 8B 32 14 */ add r4, r11, r6
/* 800CFB34 000CCA74 7C 04 28 00 */ cmpw r4, r5
/* 800CFB38 000CCA78 41 81 00 10 */ bgt lbl_800CFB48
/* 800CFB3C 000CCA7C 7D 64 32 14 */ add r11, r4, r6
/* 800CFB40 000CCA80 7C A4 28 50 */ subf r5, r4, r5
/* 800CFB44 000CCA84 7D 8C 32 14 */ add r12, r12, r6
lbl_800CFB48:
/* 800CFB48 000CCA88 54 04 0F FE */ srwi r4, r0, 0x1f
/* 800CFB4C 000CCA8C 7C 00 02 14 */ add r0, r0, r0
/* 800CFB50 000CCA90 7C 84 2A 14 */ add r4, r4, r5
/* 800CFB54 000CCA94 54 C6 F8 7E */ srwi r6, r6, 1
/* 800CFB58 000CCA98 7C A5 22 14 */ add r5, r5, r4
lbl_800CFB5C:
/* 800CFB5C 000CCA9C 28 06 00 00 */ cmplwi r6, 0
/* 800CFB60 000CCAA0 40 82 FF D0 */ bne lbl_800CFB30
/* 800CFB64 000CCAA4 3C C0 80 00 */ lis r6, 0x8000
/* 800CFB68 000CCAA8 48 00 00 6C */ b lbl_800CFBD4
lbl_800CFB6C:
/* 800CFB6C 000CCAAC 7C 0B 28 00 */ cmpw r11, r5
/* 800CFB70 000CCAB0 7D 67 5B 78 */ mr r7, r11
/* 800CFB74 000CCAB4 7D 09 32 14 */ add r8, r9, r6
/* 800CFB78 000CCAB8 41 80 00 10 */ blt lbl_800CFB88
/* 800CFB7C 000CCABC 40 82 00 44 */ bne lbl_800CFBC0
/* 800CFB80 000CCAC0 7C 08 00 40 */ cmplw r8, r0
/* 800CFB84 000CCAC4 41 81 00 3C */ bgt lbl_800CFBC0
lbl_800CFB88:
/* 800CFB88 000CCAC8 55 04 00 00 */ rlwinm r4, r8, 0, 0, 0
/* 800CFB8C 000CCACC 7D 28 32 14 */ add r9, r8, r6
/* 800CFB90 000CCAD0 3C 84 80 00 */ addis r4, r4, 0x8000
/* 800CFB94 000CCAD4 28 04 00 00 */ cmplwi r4, 0
/* 800CFB98 000CCAD8 40 82 00 10 */ bne lbl_800CFBA8
/* 800CFB9C 000CCADC 55 24 00 01 */ rlwinm. r4, r9, 0, 0, 0
/* 800CFBA0 000CCAE0 40 82 00 08 */ bne lbl_800CFBA8
/* 800CFBA4 000CCAE4 39 6B 00 01 */ addi r11, r11, 1
lbl_800CFBA8:
/* 800CFBA8 000CCAE8 7C 00 40 40 */ cmplw r0, r8
/* 800CFBAC 000CCAEC 7C A7 28 50 */ subf r5, r7, r5
/* 800CFBB0 000CCAF0 40 80 00 08 */ bge lbl_800CFBB8
/* 800CFBB4 000CCAF4 38 A5 FF FF */ addi r5, r5, -1
lbl_800CFBB8:
/* 800CFBB8 000CCAF8 7C 08 00 50 */ subf r0, r8, r0
/* 800CFBBC 000CCAFC 7D 4A 32 14 */ add r10, r10, r6
lbl_800CFBC0:
/* 800CFBC0 000CCB00 54 04 0F FE */ srwi r4, r0, 0x1f
/* 800CFBC4 000CCB04 7C 00 02 14 */ add r0, r0, r0
/* 800CFBC8 000CCB08 7C 84 2A 14 */ add r4, r4, r5
/* 800CFBCC 000CCB0C 54 C6 F8 7E */ srwi r6, r6, 1
/* 800CFBD0 000CCB10 7C A5 22 14 */ add r5, r5, r4
lbl_800CFBD4:
/* 800CFBD4 000CCB14 28 06 00 00 */ cmplwi r6, 0
/* 800CFBD8 000CCB18 40 82 FF 94 */ bne lbl_800CFB6C
/* 800CFBDC 000CCB1C 7C A0 03 79 */ or. r0, r5, r0
/* 800CFBE0 000CCB20 41 82 00 30 */ beq lbl_800CFC10
/* 800CFBE4 000CCB24 C8 02 92 88 */ lfd f0, lbl_805175E8@sda21(r2)
/* 800CFBE8 000CCB28 3C 0A 00 01 */ addis r0, r10, 1
/* 800CFBEC 000CCB2C 28 00 FF FF */ cmplwi r0, 0xffff
/* 800CFBF0 000CCB30 D8 01 00 10 */ stfd f0, 0x10(r1)
/* 800CFBF4 000CCB34 D8 01 00 10 */ stfd f0, 0x10(r1)
/* 800CFBF8 000CCB38 40 82 00 10 */ bne lbl_800CFC08
/* 800CFBFC 000CCB3C 39 40 00 00 */ li r10, 0
/* 800CFC00 000CCB40 39 8C 00 01 */ addi r12, r12, 1
/* 800CFC04 000CCB44 48 00 00 0C */ b lbl_800CFC10
lbl_800CFC08:
/* 800CFC08 000CCB48 55 40 07 FE */ clrlwi r0, r10, 0x1f
/* 800CFC0C 000CCB4C 7D 4A 02 14 */ add r10, r10, r0
lbl_800CFC10:
/* 800CFC10 000CCB50 55 80 07 FE */ clrlwi r0, r12, 0x1f
/* 800CFC14 000CCB54 7D 84 0E 70 */ srawi r4, r12, 1
/* 800CFC18 000CCB58 2C 00 00 01 */ cmpwi r0, 1
/* 800CFC1C 000CCB5C 55 45 F8 7E */ srwi r5, r10, 1
/* 800CFC20 000CCB60 3C 84 3F E0 */ addis r4, r4, 0x3fe0
/* 800CFC24 000CCB64 40 82 00 08 */ bne lbl_800CFC2C
/* 800CFC28 000CCB68 64 A5 80 00 */ oris r5, r5, 0x8000
lbl_800CFC2C:
/* 800CFC2C 000CCB6C 38 03 FC 01 */ addi r0, r3, -1023
/* 800CFC30 000CCB70 90 A1 00 14 */ stw r5, 0x14(r1)
/* 800CFC34 000CCB74 7C 00 0E 70 */ srawi r0, r0, 1
/* 800CFC38 000CCB78 54 00 A0 16 */ slwi r0, r0, 0x14
/* 800CFC3C 000CCB7C 7C 84 02 14 */ add r4, r4, r0
/* 800CFC40 000CCB80 90 81 00 10 */ stw r4, 0x10(r1)
/* 800CFC44 000CCB84 C8 21 00 10 */ lfd f1, 0x10(r1)
lbl_800CFC48:
/* 800CFC48 000CCB88 38 21 00 20 */ addi r1, r1, 0x20
/* 800CFC4C 000CCB8C 4E 80 00 20 */ blr

View File

@ -306,7 +306,7 @@ DOLPHIN:=\
$(BUILD_DIR)/asm/Dolphin/strtoul.o\
$(BUILD_DIR)/src/Dolphin/wchar_io.o\
$(BUILD_DIR)/src/Dolphin/uart_console_io_gcn.o\
$(BUILD_DIR)/asm/Dolphin/e_asin.o\
$(BUILD_DIR)/src/Dolphin/e_asin.o\
$(BUILD_DIR)/src/Dolphin/e_atan2.o\
$(BUILD_DIR)/src/Dolphin/e_exp.o\
$(BUILD_DIR)/src/Dolphin/e_fmod.o\
@ -334,7 +334,7 @@ DOLPHIN:=\
$(BUILD_DIR)/src/Dolphin/w_fmod.o\
$(BUILD_DIR)/src/Dolphin/w_log10.o\
$(BUILD_DIR)/src/Dolphin/w_pow.o\
$(BUILD_DIR)/asm/Dolphin/e_sqrt.o\
$(BUILD_DIR)/src/Dolphin/e_sqrt.o\
$(BUILD_DIR)/src/Dolphin/math_ppc.o\
$(BUILD_DIR)/src/Dolphin/w_sqrt.o\
$(BUILD_DIR)/asm/Dolphin/extras.o\

View File

@ -42,6 +42,8 @@
*/
#include "fdlibm.h"
#include "Dolphin/float.h"
#include "Dolphin/math.h"
#ifdef __STDC__
static const double
@ -79,7 +81,7 @@ double __ieee754_asin(x) double x;
if (((ix - 0x3ff00000) | __LO(x)) == 0)
/* asin(1)=+-pi/2 with inexact */
return x * pio2_hi + x * pio2_lo;
return (x - x) / (x - x); /* asin(|x|>1) is NaN */
return __float_nan; /* asin(|x|>1) is NaN */
} else if (ix < 0x3fe00000) { /* |x|<0.5 */
if (ix < 0x3e400000) { /* if |x| < 2**-27 */
if (huge + x > one)
@ -92,7 +94,7 @@ double __ieee754_asin(x) double x;
return x + x * w;
}
/* 1> |x|>= 0.5 */
w = one - __fabs(x);
w = one - fabs(x);
t = w * 0.5;
p = t * (pS0 + t * (pS1 + t * (pS2 + t * (pS3 + t * (pS4 + t * pS5)))));
q = one + t * (qS1 + t * (qS2 + t * (qS3 + t * qS4)));

View File

@ -1,194 +1,468 @@
/*
* --INFO--
* Address: 800CFA2C
* Size: 000224
*/
void __ieee754_sqrt(void)
/* @(#)e_sqrt.c 1.3 95/01/18 */
/*
* ====================================================
* Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
*
* Developed at SunSoft, a Sun Microsystems, Inc. business.
* Permission to use, copy, modify, and distribute this
* software is freely granted, provided that this notice
* is preserved.
* ====================================================
*/
/* __ieee754_sqrt(x)
* Return correctly rounded sqrt.
* ------------------------------------------
* | Use the hardware sqrt if you have one |
* ------------------------------------------
* Method:
* Bit by bit method using integer arithmetic. (Slow, but portable)
* 1. Normalization
* Scale x to y in [1,4) with even powers of 2:
* find an integer k such that 1 <= (y=x*2^(2k)) < 4, then
* sqrt(x) = 2^k * sqrt(y)
* 2. Bit by bit computation
* Let q = sqrt(y) truncated to i bit after binary point (q = 1),
* i 0
* i+1 2
* s = 2*q , and y = 2 * ( y - q ). (1)
* i i i i
*
* To compute q from q , one checks whether
* i+1 i
*
* -(i+1) 2
* (q + 2 ) <= y. (2)
* i
* -(i+1)
* If (2) is false, then q = q ; otherwise q = q + 2 .
* i+1 i i+1 i
*
* With some algebric manipulation, it is not difficult to see
* that (2) is equivalent to
* -(i+1)
* s + 2 <= y (3)
* i i
*
* The advantage of (3) is that s and y can be computed by
* i i
* the following recurrence formula:
* if (3) is false
*
* s = s , y = y ; (4)
* i+1 i i+1 i
*
* otherwise,
* -i -(i+1)
* s = s + 2 , y = y - s - 2 (5)
* i+1 i i+1 i i
*
* One may easily use induction to prove (4) and (5).
* Note. Since the left hand side of (3) contain only i+2 bits,
* it does not necessary to do a full (53-bit) comparison
* in (3).
* 3. Final rounding
* After generating the 53 bits result, we compute one more bit.
* Together with the remainder, we can decide whether the
* result is exact, bigger than 1/2ulp, or less than 1/2ulp
* (it will never equal to 1/2ulp).
* The rounding mode can be detected by checking whether
* huge + tiny is equal to huge, and whether huge - tiny is
* equal to huge for some floating point number "huge" and "tiny".
*
* Special cases:
* sqrt(+-0) = +-0 ... exact
* sqrt(inf) = inf
* sqrt(-ve) = NaN ... with invalid signal
* sqrt(NaN) = NaN ... with invalid signal for signaling NaN
*
* Other methods : see the appended file at the end of the program below.
*---------------
*/
#include "fdlibm.h"
#include "errno.h"
#include "Dolphin/float.h"
#ifdef __STDC__
static const double one = 1.0, tiny = 1.0e-300;
#else
static double one = 1.0, tiny = 1.0e-300;
#endif
#ifdef __STDC__
double __ieee754_sqrt(double x)
#else
double __ieee754_sqrt(x) double x;
#endif
{
/*
.loc_0x0:
stwu r1, -0x20(r1)
stfd f1, 0x8(r1)
lwz r6, 0x8(r1)
lwz r0, 0xC(r1)
rlwinm r3,r6,0,1,11
subis r3, r3, 0x7FF0
cmplwi r3, 0
bne- .loc_0x30
fmadd f1, f1, f1, f1
li r0, 0x21
stw r0, -0x7340(r13)
b .loc_0x21C
double z;
int sign = (int)0x80000000;
unsigned r, t1, s1, ix1, q1;
int ix0, s0, q, m, t, i;
.loc_0x30:
cmpwi r6, 0
bgt- .loc_0x64
rlwinm r3,r6,0,1,31
or. r3, r0, r3
bne- .loc_0x48
b .loc_0x21C
ix0 = __HI(x); /* high word of x */
ix1 = __LO(x); /* low word of x */
.loc_0x48:
cmpwi r6, 0
bge- .loc_0x64
lis r3, 0x8051
li r0, 0x21
stw r0, -0x7340(r13)
lfs f1, 0x48B0(r3)
b .loc_0x21C
/* take care of Inf and NaN */
if ((ix0 & 0x7ff00000) == 0x7ff00000) {
errno = 33;
return x * x + x; /* sqrt(NaN)=NaN, sqrt(+inf)=+inf
sqrt(-inf)=sNaN */
}
/* take care of zero */
if (ix0 <= 0) {
if (((ix0 & (~sign)) | ix1) == 0)
return x; /* sqrt(+-0) = +-0 */
else if (ix0 < 0) {
errno = 33;
return __float_nan;
} /* sqrt(-ve) = sNaN */
}
/* normalize x */
m = (ix0 >> 20);
if (m == 0) { /* subnormal x */
while (ix0 == 0) {
m -= 21;
ix0 |= (ix1 >> 11);
ix1 <<= 21;
}
for (i = 0; (ix0 & 0x00100000) == 0; i++)
ix0 <<= 1;
m -= i - 1;
ix0 |= (ix1 >> (32 - i));
ix1 <<= i;
}
m -= 1023; /* unbias exponent */
ix0 = (ix0 & 0x000fffff) | 0x00100000;
if (m & 1) { /* odd m, double x to make it even */
ix0 += ix0 + ((ix1 & sign) >> 31);
ix1 += ix1;
}
m >>= 1; /* m = [m/2] */
.loc_0x64:
srawi. r3, r6, 0x14
bne- .loc_0xB8
b .loc_0x80
/* generate sqrt(x) bit by bit */
ix0 += ix0 + ((ix1 & sign) >> 31);
ix1 += ix1;
q = q1 = s0 = s1 = 0; /* [q,q1] = sqrt(x) */
r = 0x00200000; /* r = moving bit from right to left */
.loc_0x70:
rlwinm r4,r0,21,11,31
rlwinm r0,r0,21,0,10
or r6, r6, r4
subi r3, r3, 0x15
while (r != 0) {
t = s0 + r;
if (t <= ix0) {
s0 = t + r;
ix0 -= t;
q += r;
}
ix0 += ix0 + ((ix1 & sign) >> 31);
ix1 += ix1;
r >>= 1;
}
.loc_0x80:
cmpwi r6, 0
beq+ .loc_0x70
li r7, 0
b .loc_0x98
r = sign;
while (r != 0) {
t1 = s1 + r;
t = s0;
if ((t < ix0) || ((t == ix0) && (t1 <= ix1))) {
s1 = t1 + r;
if (((t1 & sign) == sign) && (s1 & sign) == 0)
s0 += 1;
ix0 -= t;
if (ix1 < t1)
ix0 -= 1;
ix1 -= t1;
q1 += r;
}
ix0 += ix0 + ((ix1 & sign) >> 31);
ix1 += ix1;
r >>= 1;
}
.loc_0x90:
rlwinm r6,r6,1,0,30
addi r7, r7, 0x1
.loc_0x98:
rlwinm. r4,r6,0,11,11
beq+ .loc_0x90
subfic r4, r7, 0x20
subi r5, r7, 0x1
srw r4, r0, r4
slw r0, r0, r7
sub r3, r3, r5
or r6, r6, r4
.loc_0xB8:
subi r4, r3, 0x3FF
rlwinm r5,r6,0,12,31
rlwinm. r4,r4,0,31,31
oris r5, r5, 0x10
beq- .loc_0xDC
rlwinm r4,r0,1,31,31
add r0, r0, r0
add r4, r4, r5
add r5, r5, r4
.loc_0xDC:
rlwinm r4,r0,1,31,31
add r0, r0, r0
add r4, r4, r5
li r9, 0
add r5, r5, r4
li r11, 0
li r10, 0
li r12, 0
lis r6, 0x20
b .loc_0x130
.loc_0x104:
add r4, r11, r6
cmpw r4, r5
bgt- .loc_0x11C
add r11, r4, r6
sub r5, r5, r4
add r12, r12, r6
.loc_0x11C:
rlwinm r4,r0,1,31,31
add r0, r0, r0
add r4, r4, r5
rlwinm r6,r6,31,1,31
add r5, r5, r4
.loc_0x130:
cmplwi r6, 0
bne+ .loc_0x104
lis r6, 0x8000
b .loc_0x1A8
.loc_0x140:
cmpw r11, r5
mr r7, r11
add r8, r9, r6
blt- .loc_0x15C
bne- .loc_0x194
cmplw r8, r0
bgt- .loc_0x194
.loc_0x15C:
rlwinm r4,r8,0,0,0
add r9, r8, r6
subis r4, r4, 0x8000
cmplwi r4, 0
bne- .loc_0x17C
rlwinm. r4,r9,0,0,0
bne- .loc_0x17C
addi r11, r11, 0x1
.loc_0x17C:
cmplw r0, r8
sub r5, r5, r7
bge- .loc_0x18C
subi r5, r5, 0x1
.loc_0x18C:
sub r0, r0, r8
add r10, r10, r6
.loc_0x194:
rlwinm r4,r0,1,31,31
add r0, r0, r0
add r4, r4, r5
rlwinm r6,r6,31,1,31
add r5, r5, r4
.loc_0x1A8:
cmplwi r6, 0
bne+ .loc_0x140
or. r0, r5, r0
beq- .loc_0x1E4
lfd f0, -0x6D78(r2)
addis r0, r10, 0x1
cmplwi r0, 0xFFFF
stfd f0, 0x10(r1)
stfd f0, 0x10(r1)
bne- .loc_0x1DC
li r10, 0
addi r12, r12, 0x1
b .loc_0x1E4
.loc_0x1DC:
rlwinm r0,r10,0,31,31
add r10, r10, r0
.loc_0x1E4:
rlwinm r0,r12,0,31,31
srawi r4, r12, 0x1
cmpwi r0, 0x1
rlwinm r5,r10,31,1,31
addis r4, r4, 0x3FE0
bne- .loc_0x200
oris r5, r5, 0x8000
.loc_0x200:
subi r0, r3, 0x3FF
stw r5, 0x14(r1)
srawi r0, r0, 0x1
rlwinm r0,r0,20,0,11
add r4, r4, r0
stw r4, 0x10(r1)
lfd f1, 0x10(r1)
.loc_0x21C:
addi r1, r1, 0x20
blr
*/
/* use floating add to find out rounding direction */
if ((ix0 | ix1) != 0) {
z = one - tiny; /* trigger inexact flag */
if (z >= one) {
z = one + tiny;
if (q1 == (unsigned)0xffffffff) {
q1 = 0;
q += 1;
} else if (z > one) {
if (q1 == (unsigned)0xfffffffe)
q += 1;
q1 += 2;
} else
q1 += (q1 & 1);
}
}
ix0 = (q >> 1) + 0x3fe00000;
ix1 = q1 >> 1;
if ((q & 1) == 1)
ix1 |= sign;
ix0 += (m << 20);
__HI(z) = ix0;
__LO(z) = ix1;
return z;
}
/*
Other methods (use floating-point arithmetic)
-------------
(This is a copy of a drafted paper by Prof W. Kahan
and K.C. Ng, written in May, 1986)
Two algorithms are given here to implement sqrt(x)
(IEEE double precision arithmetic) in software.
Both supply sqrt(x) correctly rounded. The first algorithm (in
Section A) uses newton iterations and involves four divisions.
The second one uses reciproot iterations to avoid division, but
requires more multiplications. Both algorithms need the ability
to chop results of arithmetic operations instead of round them,
and the INEXACT flag to indicate when an arithmetic operation
is executed exactly with no roundoff error, all part of the
standard (IEEE 754-1985). The ability to perform shift, add,
subtract and logical AND operations upon 32-bit words is needed
too, though not part of the standard.
A. sqrt(x) by Newton Iteration
(1) Initial approximation
Let x0 and x1 be the leading and the trailing 32-bit words of
a floating point number x (in IEEE double format) respectively
1 11 52 ...widths
------------------------------------------------------
x: |s| e | f |
------------------------------------------------------
msb lsb msb lsb ...order
------------------------ ------------------------
x0: |s| e | f1 | x1: | f2 |
------------------------ ------------------------
By performing shifts and subtracts on x0 and x1 (both regarded
as integers), we obtain an 8-bit approximation of sqrt(x) as
follows.
k := (x0>>1) + 0x1ff80000;
y0 := k - T1[31&(k>>15)]. ... y ~ sqrt(x) to 8 bits
Here k is a 32-bit integer and T1[] is an integer array containing
correction terms. Now magically the floating value of y (y's
leading 32-bit word is y0, the value of its trailing word is 0)
approximates sqrt(x) to almost 8-bit.
Value of T1:
static int T1[32]= {
0, 1024, 3062, 5746, 9193, 13348, 18162, 23592,
29598, 36145, 43202, 50740, 58733, 67158, 75992, 85215,
83599, 71378, 60428, 50647, 41945, 34246, 27478, 21581,
16499, 12183, 8588, 5674, 3403, 1742, 661, 130,};
(2) Iterative refinement
Apply Heron's rule three times to y, we have y approximates
sqrt(x) to within 1 ulp (Unit in the Last Place):
y := (y+x/y)/2 ... almost 17 sig. bits
y := (y+x/y)/2 ... almost 35 sig. bits
y := y-(y-x/y)/2 ... within 1 ulp
Remark 1.
Another way to improve y to within 1 ulp is:
y := (y+x/y) ... almost 17 sig. bits to 2*sqrt(x)
y := y - 0x00100006 ... almost 18 sig. bits to sqrt(x)
2
(x-y )*y
y := y + 2* ---------- ...within 1 ulp
2
3y + x
This formula has one division fewer than the one above; however,
it requires more multiplications and additions. Also x must be
scaled in advance to avoid spurious overflow in evaluating the
expression 3y*y+x. Hence it is not recommended uless division
is slow. If division is very slow, then one should use the
reciproot algorithm given in section B.
(3) Final adjustment
By twiddling y's last bit it is possible to force y to be
correctly rounded according to the prevailing rounding mode
as follows. Let r and i be copies of the rounding mode and
inexact flag before entering the square root program. Also we
use the expression y+-ulp for the next representable floating
numbers (up and down) of y. Note that y+-ulp = either fixed
point y+-1, or multiply y by nextafter(1,+-inf) in chopped
mode.
I := FALSE; ... reset INEXACT flag I
R := RZ; ... set rounding mode to round-toward-zero
z := x/y; ... chopped quotient, possibly inexact
If(not I) then { ... if the quotient is exact
if(z=y) {
I := i; ... restore inexact flag
R := r; ... restore rounded mode
return sqrt(x):=y.
} else {
z := z - ulp; ... special rounding
}
}
i := TRUE; ... sqrt(x) is inexact
If (r=RN) then z=z+ulp ... rounded-to-nearest
If (r=RP) then { ... round-toward-+inf
y = y+ulp; z=z+ulp;
}
y := y+z; ... chopped sum
y0:=y0-0x00100000; ... y := y/2 is correctly rounded.
I := i; ... restore inexact flag
R := r; ... restore rounded mode
return sqrt(x):=y.
(4) Special cases
Square root of +inf, +-0, or NaN is itself;
Square root of a negative number is NaN with invalid signal.
B. sqrt(x) by Reciproot Iteration
(1) Initial approximation
Let x0 and x1 be the leading and the trailing 32-bit words of
a floating point number x (in IEEE double format) respectively
(see section A). By performing shifs and subtracts on x0 and y0,
we obtain a 7.8-bit approximation of 1/sqrt(x) as follows.
k := 0x5fe80000 - (x0>>1);
y0:= k - T2[63&(k>>14)]. ... y ~ 1/sqrt(x) to 7.8 bits
Here k is a 32-bit integer and T2[] is an integer array
containing correction terms. Now magically the floating
value of y (y's leading 32-bit word is y0, the value of
its trailing word y1 is set to zero) approximates 1/sqrt(x)
to almost 7.8-bit.
Value of T2:
static int T2[64]= {
0x1500, 0x2ef8, 0x4d67, 0x6b02, 0x87be, 0xa395, 0xbe7a, 0xd866,
0xf14a, 0x1091b,0x11fcd,0x13552,0x14999,0x15c98,0x16e34,0x17e5f,
0x18d03,0x19a01,0x1a545,0x1ae8a,0x1b5c4,0x1bb01,0x1bfde,0x1c28d,
0x1c2de,0x1c0db,0x1ba73,0x1b11c,0x1a4b5,0x1953d,0x18266,0x16be0,
0x1683e,0x179d8,0x18a4d,0x19992,0x1a789,0x1b445,0x1bf61,0x1c989,
0x1d16d,0x1d77b,0x1dddf,0x1e2ad,0x1e5bf,0x1e6e8,0x1e654,0x1e3cd,
0x1df2a,0x1d635,0x1cb16,0x1be2c,0x1ae4e,0x19bde,0x1868e,0x16e2e,
0x1527f,0x1334a,0x11051,0xe951, 0xbe01, 0x8e0d, 0x5924, 0x1edd,};
(2) Iterative refinement
Apply Reciproot iteration three times to y and multiply the
result by x to get an approximation z that matches sqrt(x)
to about 1 ulp. To be exact, we will have
-1ulp < sqrt(x)-z<1.0625ulp.
... set rounding mode to Round-to-nearest
y := y*(1.5-0.5*x*y*y) ... almost 15 sig. bits to 1/sqrt(x)
y := y*((1.5-2^-30)+0.5*x*y*y)... about 29 sig. bits to 1/sqrt(x)
... special arrangement for better accuracy
z := x*y ... 29 bits to sqrt(x), with z*y<1
z := z + 0.5*z*(1-z*y) ... about 1 ulp to sqrt(x)
Remark 2. The constant 1.5-2^-30 is chosen to bias the error so that
(a) the term z*y in the final iteration is always less than 1;
(b) the error in the final result is biased upward so that
-1 ulp < sqrt(x) - z < 1.0625 ulp
instead of |sqrt(x)-z|<1.03125ulp.
(3) Final adjustment
By twiddling y's last bit it is possible to force y to be
correctly rounded according to the prevailing rounding mode
as follows. Let r and i be copies of the rounding mode and
inexact flag before entering the square root program. Also we
use the expression y+-ulp for the next representable floating
numbers (up and down) of y. Note that y+-ulp = either fixed
point y+-1, or multiply y by nextafter(1,+-inf) in chopped
mode.
R := RZ; ... set rounding mode to round-toward-zero
switch(r) {
case RN: ... round-to-nearest
if(x<= z*(z-ulp)...chopped) z = z - ulp; else
if(x<= z*(z+ulp)...chopped) z = z; else z = z+ulp;
break;
case RZ:case RM: ... round-to-zero or round-to--inf
R:=RP; ... reset rounding mod to round-to-+inf
if(x<z*z ... rounded up) z = z - ulp; else
if(x>=(z+ulp)*(z+ulp) ...rounded up) z = z+ulp;
break;
case RP: ... round-to-+inf
if(x>(z+ulp)*(z+ulp)...chopped) z = z+2*ulp; else
if(x>z*z ...chopped) z = z+ulp;
break;
}
Remark 3. The above comparisons can be done in fixed point. For
example, to compare x and w=z*z chopped, it suffices to compare
x1 and w1 (the trailing parts of x and w), regarding them as
two's complement integers.
...Is z an exact square root?
To determine whether z is an exact square root of x, let z1 be the
trailing part of z, and also let x0 and x1 be the leading and
trailing parts of x.
If ((z1&0x03ffffff)!=0) ... not exact if trailing 26 bits of z!=0
I := 1; ... Raise Inexact flag: z is not exact
else {
j := 1 - [(x0>>20)&1] ... j = logb(x) mod 2
k := z1 >> 26; ... get z's 25-th and 26-th
fraction bits
I := i or (k&j) or ((k&(j+j+1))!=(x1&3));
}
R:= r ... restore rounded mode
return sqrt(x):=z.
If multiplication is cheaper then the foregoing red tape, the
Inexact flag can be evaluated by
I := i;
I := (z*z!=x) or I.
Note that z*z can overwrite I; this value must be sensed if it is
True.
Remark 4. If z*z = x exactly, then bit 25 to bit 0 of z1 must be
zero.
--------------------
z1: | f2 |
--------------------
bit 31 bit 0
Further more, bit 27 and 26 of z1, bit 0 and 1 of x1, and the odd
or even of logb(x) have the following relations:
-------------------------------------------------
bit 27,26 of z1 bit 1,0 of x1 logb(x)
-------------------------------------------------
00 00 odd and even
01 01 even
10 10 odd
10 00 even
11 01 even
-------------------------------------------------
(4) Special cases (see (4) of Section A).
*/