Use the new floating point Montgomery multiply code from Sun on Solaris.

2025-02-15 21:36:20 +00:00 · 2000-12-02 02:37:22 +00:00 · 2000-12-02 02:37:22 +00:00 · 885d29d0b4
commit 885d29d0b4
parent 9187c93f00
2 changed files with 152 additions and 43 deletions
--- a/security/nss/lib/freebl/mpi/Makefile
+++ b/security/nss/lib/freebl/mpi/Makefile
@ -36,7 +36,7 @@
 ## GPL.
 ## 
 ##
-## $Id: Makefile,v 1.11 2000/09/30 01:46:30 nelsonb%netscape.com Exp $
+## $Id: Makefile,v 1.12 2000/12/02 02:37:22 nelsonb%netscape.com Exp $
 ##

 ## Define CC to be the C compiler you wish to use.  The GNU cc
@ -65,68 +65,76 @@ CFLAGS= -O $(MPICMN)
 #CFLAGS=-ansi -pedantic -Wall -O3 $(MPICMN)
 #CFLAGS=-ansi -pedantic -Wall -g -O2 -DMP_DEBUG=1 $(MPICMN)

+ifeq ($(TARGET),mipsIRIX)
 #IRIX
-#MPICMN += -DMP_MONT_USE_MP_MUL
-#MPICMN += -DMP_ASSEMBLY_MULTIPLY -DMP_ASSEMBLY_SQUARE
-#AS_OBJS = mpi_mips.o
+MPICMN += -DMP_MONT_USE_MP_MUL -DMP_ASSEMBLY_MULTIPLY -DMP_ASSEMBLY_SQUARE
+AS_OBJS = mpi_mips.o
 #ASFLAGS = -O -OPT:Olimit=4000 -dollar -fullwarn -xansi -n32 -mips3 -exceptions
-#ASFLAGS = -O -OPT:Olimit=4000 -dollar -fullwarn -xansi -n32 -mips3 
+ASFLAGS = -O -OPT:Olimit=4000 -dollar -fullwarn -xansi -n32 -mips3 
 #CFLAGS=-ansi -n32 -O3 -fullwarn -woff 1429 -D_SGI_SOURCE $(MPICMN)
-#CFLAGS=-ansi -n32 -O2 -fullwarn -woff 1429 -D_SGI_SOURCE $(MPICMN)
+CFLAGS=-ansi -n32 -O2 -fullwarn -woff 1429 -D_SGI_SOURCE $(MPICMN)
 #CFLAGS=-ansi -n32 -g -fullwarn -woff 1429 -D_SGI_SOURCE $(MPICMN)
 #CFLAGS=-ansi -n32 -g -fullwarn -woff 1429 -D_SGI_SOURCE -DMP_NO_MP_WORD \
 $(MPICMN)
 #CFLAGS=-ansi -64 -O2 -fullwarn -woff 1429 -D_SGI_SOURCE -DMP_NO_MP_WORD \
 $(MPICMN)
+endif

+ifeq ($(TARGET),alphaOSF1)
 #Alpha/OSF1
 #CFLAGS= -O -Olimit 4000 -ieee_with_inexact -std1 -DOSF1 -D_REENTRANT $(MPICMN)
 #CFLAGS= -O -Olimit 4000 -ieee_with_inexact -std1 -DOSF1 -D_REENTRANT \
 -DMP_NO_MP_WORD $(MPICMN)
+endif

+ifeq ($(TARGET),v9SOLARIS)
 #Solaris 64
-#SOLARIS_ASM_FLAGS = -fast -xO5 -xrestrict=%all -xdepend -xchip=ultra -xarch=v9a -KPIC -mt
-#AS_OBJS = mpi_sparc.o mpv_sparc.o
-#MPICMN += -DMP_USE_UINT_DIGIT -DMP_NO_MP_WORD -DMP_ASSEMBLY_MULTIPLY 
-#CFLAGS= -O -KPIC -DSVR4 -DSYSV -D__svr4 -D__svr4__ -DSOLARIS -D_REENTRANT \
+SOLARIS_ASM_FLAGS = -fast -xO5 -xrestrict=%all -xdepend -xchip=ultra -xarch=v9a -KPIC -mt
+AS_OBJS = montmulfv9.o mpi_sparc.o mpv_sparc.o
+MPICMN += -DMP_USE_UINT_DIGIT -DMP_NO_MP_WORD -DMP_ASSEMBLY_MULTIPLY 
+MPICMN += -DMP_USING_MONT_MULF
+CFLAGS= -O -KPIC -DSVR4 -DSYSV -D__svr4 -D__svr4__ -DSOLARIS -D_REENTRANT \
 -DSOLARIS2_8 -D_SVID_GETTOD -xarch=v9 -DXP_UNIX -DNSS_USE_64 $(MPICMN)
 #CFLAGS= -g -KPIC -DSVR4 -DSYSV -D__svr4 -D__svr4__ -DSOLARIS -D_REENTRANT \
 -DSOLARIS2_8 -D_SVID_GETTOD -xarch=v9 -DXP_UNIX -DNSS_USE_64 $(MPICMN)
+endif

+ifeq ($(TARGET),v8SOLARIS)
 #Solaris 32
-#CFLAGS=-O -KPIC -DSVR4 -DSYSV -D__svr4 -D__svr4__ -DSOLARIS -D_REENTRANT \
+SOLARIS_ASM_FLAGS = -xchip=ultra -xarch=v8plusa -KPIC -mt 
+AS_OBJS = montmulfv8.o mpi_sparc.o mpv_sparc32.o
+MPICMN += -DMP_ASSEMBLY_MULTIPLY 
+MPICMN += -DMP_USING_MONT_MULF
+CFLAGS=-O -KPIC -DSVR4 -DSYSV -D__svr4 -D__svr4__ -DSOLARIS -D_REENTRANT \
 -DSOLARIS2_6 -D_SVID_GETTOD -xarch=v8 -DXP_UNIX -DMP_NO_MP_WORD $(MPICMN)
 #CFLAGS=-O -KPIC -DSVR4 -DSYSV -D__svr4 -D__svr4__ -DSOLARIS -D_REENTRANT \
 -DSOLARIS2_6 -D_SVID_GETTOD -xarch=v8 -DXP_UNIX $(MPICMN)
-#SOLARIS_ASM_FLAGS = -xchip=ultra -xarch=v8plusa -KPIC -mt 
-#AS_OBJS = mpi_sparc.o mpv_sparc32.o
-#MPICMN += -DMP_ASSEMBLY_MULTIPLY 
-#CFLAGS=-O -KPIC -DSVR4 -DSYSV -D__svr4 -D__svr4__ -DSOLARIS -D_REENTRANT \
- -DSOLARIS2_6 -D_SVID_GETTOD -xarch=v8 -DXP_UNIX -DMP_NO_MP_WORD \
- -DMP_ASSEMBLY_MULTIPLY $(MPICMN)
+endif

+ifeq ($(TARGET),HPUX)
 #HPUX
 #CFLAGS= -O -DHPUX10 -D_POSIX_C_SOURCE=199506L -Ae +Z -DHPUX -Dhppa \
 -D_HPUX_SOURCE -Aa +e +DA2.0W +DS2.0 +DChpux -DHPUX11  -DXP_UNIX \
 -DNSS_USE_64 $(MPICMN)
 #CFLAGS= -O -DHPUX10 -D_POSIX_C_SOURCE=199506L -Ae +Z -DHPUX -Dhppa \
 -D_HPUX_SOURCE +DAportable +DS1.1 -DHPUX11 -DXP_UNIX -DMP_NO_MP_WORD $(MPICMN)
+endif

+ifeq ($(TARGET),x86LINUX)
 #Linux
-#AS_OBJS = mpi_x86.o
-#MPICMN += -DMP_ASSEMBLY_MULTIPLY -DMP_ASSEMBLY_SQUARE -DMP_ASSEMBLY_DIV_2DX1D
-#MPICMN += -DMP_MONT_USE_MP_MUL
-#CFLAGS= -O2 -fPIC -DLINUX1_2 -Di386 -D_XOPEN_SOURCE -DLINUX2_1 -ansi -Wall \
+AS_OBJS = mpi_x86.o
+MPICMN += -DMP_ASSEMBLY_MULTIPLY -DMP_ASSEMBLY_SQUARE -DMP_ASSEMBLY_DIV_2DX1D
+MPICMN += -DMP_MONT_USE_MP_MUL
+CFLAGS= -O2 -fPIC -DLINUX1_2 -Di386 -D_XOPEN_SOURCE -DLINUX2_1 -ansi -Wall \
 -pipe -DLINUX -Dlinux -D_POSIX_SOURCE -D_BSD_SOURCE -DHAVE_STRERROR \
 -DXP_UNIX -UDEBUG -DNDEBUG -D_REENTRANT $(MPICMN)
-
 #CFLAGS= -g -fPIC -DLINUX1_2 -Di386 -D_XOPEN_SOURCE -DLINUX2_1 -ansi -Wall \
 -pipe -DLINUX -Dlinux -D_POSIX_SOURCE -D_BSD_SOURCE -DHAVE_STRERROR \
 -DXP_UNIX -DDEBUG -UNDEBUG -D_REENTRANT $(MPICMN)
-
 #CFLAGS= -g -fPIC -DLINUX1_2 -Di386 -D_XOPEN_SOURCE -DLINUX2_1 -ansi -Wall \
 -pipe -DLINUX -Dlinux -D_POSIX_SOURCE -D_BSD_SOURCE -DHAVE_STRERROR \
 -DXP_UNIX -UDEBUG -DNDEBUG -D_REENTRANT $(MPICMN)
+endif

 ##
 ## Define LIBS to include any libraries you need to link against.
@ -176,7 +184,7 @@ DOCS=README doc utils/README utils/PRIMES

 ## This is the list of tools built by 'make tools'
 TOOLS=gcd invmod isprime lap dec2hex hex2dec primegen prng \
-	basecvt fact exptmod pi makeprime
+	basecvt fact exptmod pi makeprime identest

 LIBOBJS = mpprime.o mpmontg.o mplogic.o mpi.o $(AS_OBJS)
 LIBHDRS = mpi-config.h mpi-priv.h mpi.h
@ -221,12 +229,22 @@ mpprime.o: mpprime.c mpi-priv.h mpprime.h mplogic.h primes.c $(LIBHDRS)
 mpi_mips.o: mpi_mips.s
 	$(CC) -o $@ $(ASFLAGS) -c mpi_mips.s

+mpi_sparc.o : montmulf.h
+
 mpv_sparc32.o: mpv_sparc32.S
 	$(CC) -o $@ $(SOLARIS_ASM_FLAGS) -c mpv_sparc32.S

 mpv_sparc.o: vis_64.il mpv_sparc.c
 	$(CC) -o $@ $(SOLARIS_ASM_FLAGS) -c vis_64.il mpv_sparc.c

+montmulfv8.o montmulfv9.o : %.o : %.s 
+	$(CC) -o $@ $(SOLARIS_ASM_FLAGS) -c $<
+
+# This rule is used to build the .s sources, which are then hand optimized.
+#montmulfv8.s montmulfv9.s : montmulf%.s : montmulf%.il montmulf.c montmulf.h 
+#	$(CC) -o $@ $(SOLARIS_ASM_FLAGS) -S montmulf$*.il montmulf.c
+
+
 libmpi.a: $(LIBOBJS)
 	ar -cvr libmpi.a $(LIBOBJS)
 	$(RANLIB) libmpi.a
@ -311,6 +329,9 @@ clean:
 	rm -f utils/core
 	rm -f utils/*~ utils/.*~

+clobber: clean
+	rm -f $(TOOLS) $(UTILS)
+
 distclean: clean
 	rm -f mptest? mpi-test metime mulsqr karatsuba
 	rm -f mptest?a mptest?b
--- a/security/nss/lib/freebl/mpi/mpmontg.c
+++ b/security/nss/lib/freebl/mpi/mpmontg.c
@ -29,7 +29,7 @@
 * the GPL.  If you do not delete the provisions above, a recipient
 * may use your version of this file under either the MPL or the
 * GPL.
- *  $Id: mpmontg.c,v 1.8 2000/09/14 00:30:51 nelsonb%netscape.com Exp $
+ *  $Id: mpmontg.c,v 1.9 2000/12/02 02:37:22 nelsonb%netscape.com Exp $
 */

 /* This file implements moduluar exponentiation using Montgomery's
@ -41,10 +41,14 @@
 * published by Springer Verlag.
 */

+/* #define MP_USING_MONT_MULF 1 */
 #include <string.h>
 #include "mpi-priv.h"
 #include "mplogic.h"
 #include "mpprime.h"
+#ifdef MP_USING_MONT_MULF
+#include "montmulf.h"
+#endif

 #define STATIC
 /* #define DEBUG 1  */
@ -192,9 +196,21 @@ mp_err mp_exptmod(const mp_int *inBase, const mp_int *exponent,
  mp_size bits_in_exponent;
  mp_size i;
  mp_size window_bits, odd_ints;
-  mp_err res;
-  mp_int square, accum1, accum2, goodBase;
+  mp_err  res;
+  int     expOff, nLen;
+  mp_int  square, accum1, accum2, goodBase;
  mp_mont_modulus mmm;
+#ifdef MP_USING_MONT_MULF
+  int      dSize = 0, oddPowSize, dTmpSize, dSqrSize;
+  double   dn0;
+  double   *dBuf = 0; 
+  double   *dm1, *dn, *dSqr, *d16Tmp, *oddPowers[MAX_ODD_INTS], *dTmp;
+  mp_digit *mResult;
+#else
+  /* power2 = base ** 2 */
+  /* oddPowers[i] = base ** (2*i + 1) */
+  mp_int power2, oddPowers[MAX_ODD_INTS];
+#endif

  /* function for computing n0prime only works if n0 is odd */
  if (!mp_isodd(modulus))
@ -204,6 +220,10 @@ mp_err mp_exptmod(const mp_int *inBase, const mp_int *exponent,
  MP_DIGITS(&accum1) = 0;
  MP_DIGITS(&accum2) = 0;
  MP_DIGITS(&goodBase) = 0;
+#ifdef MP_USING_MONT_MULF
+  for (i = 0; i < MAX_ODD_INTS; ++i)
+    oddPowers[i] = 0;
+#endif

  if (mp_cmp(inBase, modulus) < 0) {
    base = inBase;
@ -213,10 +233,12 @@ mp_err mp_exptmod(const mp_int *inBase, const mp_int *exponent,
    MP_CHECKOK( mp_mod(inBase, modulus, &goodBase) );
  }

-  MP_CHECKOK( mp_init_size(&square, 2 * MP_USED(modulus) + 2) );
-  MP_CHECKOK( mp_init_size(&accum1, 3 * MP_USED(modulus) + 2) );
-  MP_CHECKOK( mp_init_size(&accum2, 3 * MP_USED(modulus) + 2) );
-
+  nLen  = MP_USED(modulus);
+  MP_CHECKOK( mp_init_size(&square, 2 * nLen + 2) );
+  MP_CHECKOK( mp_init_size(&accum1, 3 * nLen + 2) );
+#ifndef MP_USING_MONT_MULF
+  MP_CHECKOK( mp_init_size(&accum2, 3 * nLen + 2) );
+#endif
  mmm.N = *modulus;			/* a copy of the mp_int struct */
  i = mpl_significant_bits(modulus);
  i += MP_DIGIT_BIT - 1;
@ -228,7 +250,12 @@ mp_err mp_exptmod(const mp_int *inBase, const mp_int *exponent,
  mmm.n0prime = 0 - s_mp_invmod_radix( MP_DIGIT(modulus, 0) );

  MP_CHECKOK( s_mp_to_mont(base, &mmm, &square) );
-
+#ifdef MP_USING_MONT_MULF
+  MP_CHECKOK( s_mp_pad(&square, nLen) );
+  mp_set(&accum1, 1);
+  MP_CHECKOK( s_mp_to_mont(&accum1, &mmm, &accum1) );
+  MP_CHECKOK( s_mp_pad(&accum1, nLen) );
+#endif
  bits_in_exponent = mpl_significant_bits(exponent);
  if (bits_in_exponent > 480)
    window_bits = 6;
@ -242,19 +269,66 @@ mp_err mp_exptmod(const mp_int *inBase, const mp_int *exponent,
    bits_in_exponent += window_bits - i;
  } 
  {
-    /* oddPowers[i] = base ** (2*i + 1); */
-    int expOff;
-    /* power2 = base ** 2; oddPowers[i] = base ** (2*i + 1); */
-    mp_int power2, oddPowers[MAX_ODD_INTS];
+#ifdef MP_USING_MONT_MULF
+    oddPowSize = 2 * nLen + 1;
+    dTmpSize   = 2 * oddPowSize;
+    dSize = sizeof(double) * (nLen * 4 + 1 + 
+			      ((odd_ints + 1) * oddPowSize) + dTmpSize);
+    if (!(dBuf = (double *)malloc(dSize))) { res = MP_MEM; goto CLEANUP; } /* out of memory: bail before dBuf is carved up and written */
+    dm1    = dBuf;		/* array of d32 */
+    dn     = dBuf   + nLen;	/* array of d32 */
+    dSqr   = dn     + nLen;    	/* array of d32 */
+    d16Tmp = dSqr   + nLen;	/* array of d16 */
+    dTmp   = d16Tmp + oddPowSize;
+
+    for (i = 0; i < odd_ints; ++i) {
+	oddPowers[i] = dTmp;
+	dTmp += oddPowSize;
+    }
+    mResult = (mp_digit *)(dTmp + dTmpSize);	/* size is nLen + 1 */
+
+    /* Make dn and dn0 */
+    conv_i32_to_d32(dn, MP_DIGITS(modulus), nLen);
+    dn0 = (double)(mmm.n0prime & 0xffff);
+
+    /* Make dSqr */
+    conv_i32_to_d32_and_d16(dm1, oddPowers[0], MP_DIGITS(&square), nLen);
+    mont_mulf_noconv(mResult, dm1, oddPowers[0], 
+		     dTmp, dn, MP_DIGITS(modulus), nLen, dn0);
+    conv_i32_to_d32(dSqr, mResult, nLen);
+
+    for (i = 1; i < odd_ints; ++i) {
+      mont_mulf_noconv(mResult, dSqr, oddPowers[i - 1], 
+		       dTmp, dn, MP_DIGITS(modulus), nLen, dn0);
+      conv_i32_to_d16(oddPowers[i], mResult, nLen);
+    }
+
+    s_mp_copy(MP_DIGITS(&accum1), mResult, nLen);
+
+#define SWAPPA 
+
+/* computes montgomery square of the integer in mResult */
+#define SQR(a,b) \
+    conv_i32_to_d32_and_d16(dm1, d16Tmp, mResult, nLen); \
+    mont_mulf_noconv(mResult, dm1, d16Tmp, \
+		     dTmp, dn, MP_DIGITS(modulus), nLen, dn0)
+
+/* computes montgomery product of oddPowers[x] and the integer in mResult */
+#define MUL(x,a,b) \
+    conv_i32_to_d32(dm1, mResult, nLen); \
+    mont_mulf_noconv(mResult, dm1, oddPowers[x], \
+		     dTmp, dn, MP_DIGITS(modulus), nLen, dn0)
+
+#else

    MP_CHECKOK( mp_init_copy(oddPowers, &square) );

-    mp_init_size(&power2, MP_USED(modulus) + 2 * MP_USED(&square) + 2);
+    mp_init_size(&power2, nLen + 2 * MP_USED(&square) + 2);
    MP_CHECKOK( mp_sqr(&square, &power2) );	/* square = square ** 2 */
    MP_CHECKOK( s_mp_redc(&power2, &mmm) );

    for (i = 1; i < odd_ints; ++i) {
-      mp_init_size(oddPowers + i, MP_USED(modulus) + 2 * MP_USED(&power2) + 2);
+      mp_init_size(oddPowers + i, nLen + 2 * MP_USED(&power2) + 2);
      MP_CHECKOK( mp_mul(oddPowers + (i - 1), &power2, oddPowers + i) );
      MP_CHECKOK( s_mp_redc(oddPowers + i, &mmm) );
    }
@ -277,6 +351,7 @@ mp_err mp_exptmod(const mp_int *inBase, const mp_int *exponent,
 #endif

 #define SWAPPA ptmp = pa1; pa1 = pa2; pa2 = ptmp
+#endif

    for (expOff = bits_in_exponent - window_bits; expOff >= 0; expOff -= window_bits) {
      mp_size smallExp;
@ -354,18 +429,31 @@ mp_err mp_exptmod(const mp_int *inBase, const mp_int *exponent,
      }
    }

-    mp_clear(&power2);
-    for (i = 0; i < odd_ints; ++i) {
-      mp_clear(oddPowers + i);
-    }
+#ifdef MP_USING_MONT_MULF
+    s_mp_copy(mResult, MP_DIGITS(&square), nLen);
+    pa1 = &square;
+#endif
  }
  res = s_mp_redc(pa1, &mmm);
  mp_exch(pa1, result);
+
 CLEANUP:
  mp_clear(&square);
  mp_clear(&accum1);
-  mp_clear(&accum2);
  mp_clear(&goodBase);
+#ifdef MP_USING_MONT_MULF
+  if (dBuf) {
+    if (dSize)
+      memset(dBuf, 0, dSize);
+    free(dBuf);
+  }
+#else
+  mp_clear(&power2);
+  for (i = 0; i < odd_ints; ++i) {
+    mp_clear(oddPowers + i);
+  }
+  mp_clear(&accum2);
+#endif
  /* Don't mp_clear mmm.N because it is merely a copy of modulus.
  ** Just zap it.
  */