Fix for bug 272327 . AMD64 assembly optimization for bignum multiply. r=nelson

This commit is contained in:
julien.pierre.bugs%sun.com 2005-02-25 04:30:11 +00:00
Parent 9de196f20f
Commit 0126f05e21
7 changed files: 952 additions and 10 deletions

View file

@ -129,9 +129,10 @@ endif
ifeq ($(OS_TARGET),Linux)
ifeq ($(CPU_ARCH),x86_64)
ASFILES = arcfour-amd64-gas.s
ASFLAGS += -march=opteron -m64
DEFINES += -DNSS_BEVAND_ARCFOUR
ASFILES = arcfour-amd64-gas.s mpi_amd64_gas.s
ASFLAGS += -march=opteron -m64 -fPIC
DEFINES += -DNSS_BEVAND_ARCFOUR -DMPI_AMD64 -DMP_ASSEMBLY_MULTIPLY
MPI_SRCS += mpi_amd64.c
endif
ifeq ($(CPU_ARCH),x86)
ASFILES = mpi_x86.s
@ -246,13 +247,14 @@ else
ifeq ($(USE_64),1)
# Solaris for AMD64
ifdef NS_USE_GCC
ASFILES = arcfour-amd64-gas.s
ASFLAGS += -march=opteron -m64
ASFILES = arcfour-amd64-gas.s mpi_amd64_gas.s
ASFLAGS += -march=opteron -m64 -fPIC
else
ASFILES = arcfour-amd64-sun.s
ASFILES = arcfour-amd64-sun.s mpi_amd64_sun.s
ASFLAGS += -xarch=generic64 -K PIC
endif
DEFINES += -DNSS_BEVAND_ARCFOUR
DEFINES += -DNSS_BEVAND_ARCFOUR -DMPI_AMD64 -DMP_ASSEMBLY_MULTIPLY
MPI_SRCS += mpi_amd64.c
else
# Solaris x86
DEFINES += -D_X86_

View file

@ -42,7 +42,7 @@
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
/* $Id: mpi-priv.h,v 1.17 2004-04-27 23:04:36 gerv%gerv.net Exp $ */
/* $Id: mpi-priv.h,v 1.18 2005-02-25 04:30:11 julien.pierre.bugs%sun.com Exp $ */
#ifndef _MPI_PRIV_H_
#define _MPI_PRIV_H_ 1
@ -238,10 +238,28 @@ mp_err s_mp_invmod_even_m(const mp_int *a, const mp_int *m, mp_int *c);
#define MPI_ASM_DECL
#endif
#ifdef MPI_AMD64
mp_digit MPI_ASM_DECL s_mpv_mul_set_vec64(mp_digit*, mp_digit *, mp_size, mp_digit);
mp_digit MPI_ASM_DECL s_mpv_mul_add_vec64(mp_digit*, const mp_digit*, mp_size, mp_digit);
/* c = a * b */
#define s_mpv_mul_d(a, a_len, b, c) \
((unsigned long*)c)[a_len] = s_mpv_mul_set_vec64(c, a, a_len, b)
/* c += a * b */
#define s_mpv_mul_d_add(a, a_len, b, c) \
((unsigned long*)c)[a_len] = s_mpv_mul_add_vec64(c, a, a_len, b)
#else
void MPI_ASM_DECL s_mpv_mul_d(const mp_digit *a, mp_size a_len,
mp_digit b, mp_digit *c);
void MPI_ASM_DECL s_mpv_mul_d_add(const mp_digit *a, mp_size a_len,
mp_digit b, mp_digit *c);
#endif
void MPI_ASM_DECL s_mpv_mul_d_add_prop(const mp_digit *a,
mp_size a_len, mp_digit b,
mp_digit *c);

View file

@ -42,7 +42,7 @@
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
/* $Id: mpi-test.c,v 1.12 2004-04-27 23:04:36 gerv%gerv.net Exp $ */
/* $Id: mpi-test.c,v 1.13 2005-02-25 04:30:11 julien.pierre.bugs%sun.com Exp $ */
#include <stdio.h>
#include <stdlib.h>
@ -134,6 +134,7 @@ const char *mp19 =
"F1C9DACDA287F2E3C88DCE2393B8F53DAAAC1196DC36510962B6B59454CFE64B";
const char *mp20 =
"D445662C8B6FE394107B867797750C326E0F4A967E135FC430F6CD7207913AC7";
const char* mp21 = "2";
const mp_digit md1 = 0;
const mp_digit md2 = 0x1;
@ -201,7 +202,7 @@ const char *p_mp1415 =
"548F1732452F9E7F810C0B4B430C073C0FBCE03F0D03F82630654BCE166AA772E1EE"
"DD0C08D3E3EBDF0AF54203B43AFDFC40D8FC79C97A4B0A4E1BEB14D8FCEFDDED8758"
"6ED65B18";
const char *p_mp2121 = "4";
const char *mp_mp345 = "B9B6D3A3";
const char *mp_mp335 = "16609C2D";
@ -874,6 +875,15 @@ int test_mul(void)
reason("error: computed %s, expected %s\n", g_intbuf, p_mp1415);
res = 1;
}
mp_read_radix(&a, mp21, 10); mp_read_radix(&b, mp21, 10);
IFOK( mp_mul(&a, &b, &a) );
mp_toradix(&a, g_intbuf, 10);
if(strcmp(g_intbuf, p_mp2121) != 0) {
reason("error: computed %s, expected %s\n", g_intbuf, p_mp2121);
res = 1; goto CLEANUP;
}
CLEANUP:
mp_clear(&a); mp_clear(&b);

View file

@ -0,0 +1,65 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is the Solaris software cryptographic token.
*
* The Initial Developer of the Original Code is
* Sun Microsystems, Inc.
* Portions created by the Initial Developer are Copyright (C) 2005
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
* Sun Microsystems, Inc.
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
#ifndef MPI_AMD64
#error This file only works on AMD64 platforms.
#endif
#include <mpi-priv.h>
/*
* MPI glue
*
*/
/* Presently, this is only used by the Montgomery arithmetic code. */
/* c += a * b */
/*
 * Multiply-accumulate with full carry propagation: c += a * b, where a is
 * an a_len-digit vector and b is a single digit.  The assembly primitive
 * s_mpv_mul_add_vec64 handles the low a_len digits and returns the carry
 * out of that window; this wrapper then ripples that carry through the
 * higher digits of c until it is absorbed.
 */
void MPI_ASM_DECL s_mpv_mul_d_add_prop(const mp_digit *a, mp_size a_len,
                                       mp_digit b, mp_digit *c)
{
    /* Carry out of the vector multiply-add over c[0 .. a_len-1]. */
    mp_digit carry = s_mpv_mul_add_vec64(c, a, a_len, b);
    /* First digit above the product window. */
    mp_digit *p = c + a_len;

    /* Propagate the carry upward.  For unsigned addition, the sum wraps
     * exactly when it ends up smaller than either addend, so a new carry
     * of at most 1 is produced per digit. */
    while (carry != 0) {
        mp_digit sum = *p + carry;
        carry = (sum < carry) ? 1 : 0;
        *p++ = sum;
    }
}

View file

@ -0,0 +1,418 @@
# ***** BEGIN LICENSE BLOCK *****
# Version: MPL 1.1/GPL 2.0/LGPL 2.1
#
# The contents of this file are subject to the Mozilla Public License Version
# 1.1 (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
# http://www.mozilla.org/MPL/
#
# Software distributed under the License is distributed on an "AS IS" basis,
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
# for the specific language governing rights and limitations under the
# License.
#
# The Original Code is the Solaris software cryptographic token.
#
# The Initial Developer of the Original Code is
# Sun Microsystems, Inc.
# Portions created by the Initial Developer are Copyright (C) 2005
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
# Sun Microsystems, Inc.
#
# Alternatively, the contents of this file may be used under the terms of
# either the GNU General Public License Version 2 or later (the "GPL"), or
# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
# in which case the provisions of the GPL or the LGPL are applicable instead
# of those above. If you wish to allow use of your version of this file only
# under the terms of either the GPL or the LGPL, and not to allow others to
# use your version of this file under the terms of the MPL, indicate your
# decision by deleting the provisions above and replace them with the notice
# and other provisions required by the GPL or the LGPL. If you do not delete
# the provisions above, a recipient may use your version of this file under
# the terms of any one of the MPL, the GPL or the LGPL.
#
# ***** END LICENSE BLOCK ***** */
# ------------------------------------------------------------------------
#
# Implementation of s_mpv_mul_set_vec which exploits
# the 64X64->128 bit unsigned multiply instruction.
#
# ------------------------------------------------------------------------
# r = a * digit, r and a are vectors of length len
# returns the carry digit
# r and a are 64 bit aligned.
#
# uint64_t
# s_mpv_mul_set_vec64(uint64_t *r, uint64_t *a, int len, uint64_t digit)
#
.text; .align 16; .globl s_mpv_mul_set_vec64; .type s_mpv_mul_set_vec64, @function; s_mpv_mul_set_vec64:
xorq %rax, %rax # if (len == 0) return (0)
testq %rdx, %rdx
jz .L17
movq %rdx, %r8 # Use r8 for len; %rdx is used by mul
xorq %r9, %r9 # cy = 0
.L15:
cmpq $8, %r8 # 8 - len
jb .L16
movq 0(%rsi), %rax # rax = a[0]
movq 8(%rsi), %r11 # prefetch a[1]
mulq %rcx # p = a[0] * digit
addq %r9, %rax
adcq $0, %rdx # p += cy
movq %rax, 0(%rdi) # r[0] = lo(p)
movq %rdx, %r9 # cy = hi(p)
movq %r11, %rax
movq 16(%rsi), %r11 # prefetch a[2]
mulq %rcx # p = a[1] * digit
addq %r9, %rax
adcq $0, %rdx # p += cy
movq %rax, 8(%rdi) # r[1] = lo(p)
movq %rdx, %r9 # cy = hi(p)
movq %r11, %rax
movq 24(%rsi), %r11 # prefetch a[3]
mulq %rcx # p = a[2] * digit
addq %r9, %rax
adcq $0, %rdx # p += cy
movq %rax, 16(%rdi) # r[2] = lo(p)
movq %rdx, %r9 # cy = hi(p)
movq %r11, %rax
movq 32(%rsi), %r11 # prefetch a[4]
mulq %rcx # p = a[3] * digit
addq %r9, %rax
adcq $0, %rdx # p += cy
movq %rax, 24(%rdi) # r[3] = lo(p)
movq %rdx, %r9 # cy = hi(p)
movq %r11, %rax
movq 40(%rsi), %r11 # prefetch a[5]
mulq %rcx # p = a[4] * digit
addq %r9, %rax
adcq $0, %rdx # p += cy
movq %rax, 32(%rdi) # r[4] = lo(p)
movq %rdx, %r9 # cy = hi(p)
movq %r11, %rax
movq 48(%rsi), %r11 # prefetch a[6]
mulq %rcx # p = a[5] * digit
addq %r9, %rax
adcq $0, %rdx # p += cy
movq %rax, 40(%rdi) # r[5] = lo(p)
movq %rdx, %r9 # cy = hi(p)
movq %r11, %rax
movq 56(%rsi), %r11 # prefetch a[7]
mulq %rcx # p = a[6] * digit
addq %r9, %rax
adcq $0, %rdx # p += cy
movq %rax, 48(%rdi) # r[6] = lo(p)
movq %rdx, %r9 # cy = hi(p)
movq %r11, %rax
mulq %rcx # p = a[7] * digit
addq %r9, %rax
adcq $0, %rdx # p += cy
movq %rax, 56(%rdi) # r[7] = lo(p)
movq %rdx, %r9 # cy = hi(p)
addq $64, %rsi
addq $64, %rdi
subq $8, %r8
jz .L17
jmp .L15
.L16:
movq 0(%rsi), %rax
mulq %rcx # p = a[0] * digit
addq %r9, %rax
adcq $0, %rdx # p += cy
movq %rax, 0(%rdi) # r[0] = lo(p)
movq %rdx, %r9 # cy = hi(p)
decq %r8
jz .L17
movq 8(%rsi), %rax
mulq %rcx # p = a[1] * digit
addq %r9, %rax
adcq $0, %rdx # p += cy
movq %rax, 8(%rdi) # r[1] = lo(p)
movq %rdx, %r9 # cy = hi(p)
decq %r8
jz .L17
movq 16(%rsi), %rax
mulq %rcx # p = a[2] * digit
addq %r9, %rax
adcq $0, %rdx # p += cy
movq %rax, 16(%rdi) # r[2] = lo(p)
movq %rdx, %r9 # cy = hi(p)
decq %r8
jz .L17
movq 24(%rsi), %rax
mulq %rcx # p = a[3] * digit
addq %r9, %rax
adcq $0, %rdx # p += cy
movq %rax, 24(%rdi) # r[3] = lo(p)
movq %rdx, %r9 # cy = hi(p)
decq %r8
jz .L17
movq 32(%rsi), %rax
mulq %rcx # p = a[4] * digit
addq %r9, %rax
adcq $0, %rdx # p += cy
movq %rax, 32(%rdi) # r[4] = lo(p)
movq %rdx, %r9 # cy = hi(p)
decq %r8
jz .L17
movq 40(%rsi), %rax
mulq %rcx # p = a[5] * digit
addq %r9, %rax
adcq $0, %rdx # p += cy
movq %rax, 40(%rdi) # r[5] = lo(p)
movq %rdx, %r9 # cy = hi(p)
decq %r8
jz .L17
movq 48(%rsi), %rax
mulq %rcx # p = a[6] * digit
addq %r9, %rax
adcq $0, %rdx # p += cy
movq %rax, 48(%rdi) # r[6] = lo(p)
movq %rdx, %r9 # cy = hi(p)
decq %r8
jz .L17
.L17:
movq %r9, %rax
ret
.size s_mpv_mul_set_vec64, [.-s_mpv_mul_set_vec64]
# ------------------------------------------------------------------------
#
# Implementation of s_mpv_mul_add_vec which exploits
# the 64X64->128 bit unsigned multiply instruction.
#
# ------------------------------------------------------------------------
# r += a * digit, r and a are vectors of length len
# returns the carry digit
# r and a are 64 bit aligned.
#
# uint64_t
# s_mpv_mul_add_vec64(uint64_t *r, uint64_t *a, int len, uint64_t digit)
#
.text; .align 16; .globl s_mpv_mul_add_vec64; .type s_mpv_mul_add_vec64, @function; s_mpv_mul_add_vec64:
xorq %rax, %rax # if (len == 0) return (0)
testq %rdx, %rdx
jz .L27
movq %rdx, %r8 # Use r8 for len; %rdx is used by mul
xorq %r9, %r9 # cy = 0
.L25:
cmpq $8, %r8 # 8 - len
jb .L26
movq 0(%rsi), %rax # rax = a[0]
movq 0(%rdi), %r10 # r10 = r[0]
movq 8(%rsi), %r11 # prefetch a[1]
mulq %rcx # p = a[0] * digit
addq %r10, %rax
adcq $0, %rdx # p += r[0]
movq 8(%rdi), %r10 # prefetch r[1]
addq %r9, %rax
adcq $0, %rdx # p += cy
movq %rax, 0(%rdi) # r[0] = lo(p)
movq %rdx, %r9 # cy = hi(p)
movq %r11, %rax
movq 16(%rsi), %r11 # prefetch a[2]
mulq %rcx # p = a[1] * digit
addq %r10, %rax
adcq $0, %rdx # p += r[1]
movq 16(%rdi), %r10 # prefetch r[2]
addq %r9, %rax
adcq $0, %rdx # p += cy
movq %rax, 8(%rdi) # r[1] = lo(p)
movq %rdx, %r9 # cy = hi(p)
movq %r11, %rax
movq 24(%rsi), %r11 # prefetch a[3]
mulq %rcx # p = a[2] * digit
addq %r10, %rax
adcq $0, %rdx # p += r[2]
movq 24(%rdi), %r10 # prefetch r[3]
addq %r9, %rax
adcq $0, %rdx # p += cy
movq %rax, 16(%rdi) # r[2] = lo(p)
movq %rdx, %r9 # cy = hi(p)
movq %r11, %rax
movq 32(%rsi), %r11 # prefetch a[4]
mulq %rcx # p = a[3] * digit
addq %r10, %rax
adcq $0, %rdx # p += r[3]
movq 32(%rdi), %r10 # prefetch r[4]
addq %r9, %rax
adcq $0, %rdx # p += cy
movq %rax, 24(%rdi) # r[3] = lo(p)
movq %rdx, %r9 # cy = hi(p)
movq %r11, %rax
movq 40(%rsi), %r11 # prefetch a[5]
mulq %rcx # p = a[4] * digit
addq %r10, %rax
adcq $0, %rdx # p += r[4]
movq 40(%rdi), %r10 # prefetch r[5]
addq %r9, %rax
adcq $0, %rdx # p += cy
movq %rax, 32(%rdi) # r[4] = lo(p)
movq %rdx, %r9 # cy = hi(p)
movq %r11, %rax
movq 48(%rsi), %r11 # prefetch a[6]
mulq %rcx # p = a[5] * digit
addq %r10, %rax
adcq $0, %rdx # p += r[5]
movq 48(%rdi), %r10 # prefetch r[6]
addq %r9, %rax
adcq $0, %rdx # p += cy
movq %rax, 40(%rdi) # r[5] = lo(p)
movq %rdx, %r9 # cy = hi(p)
movq %r11, %rax
movq 56(%rsi), %r11 # prefetch a[7]
mulq %rcx # p = a[6] * digit
addq %r10, %rax
adcq $0, %rdx # p += r[6]
movq 56(%rdi), %r10 # prefetch r[7]
addq %r9, %rax
adcq $0, %rdx # p += cy
movq %rax, 48(%rdi) # r[6] = lo(p)
movq %rdx, %r9 # cy = hi(p)
movq %r11, %rax
mulq %rcx # p = a[7] * digit
addq %r10, %rax
adcq $0, %rdx # p += r[7]
addq %r9, %rax
adcq $0, %rdx # p += cy
movq %rax, 56(%rdi) # r[7] = lo(p)
movq %rdx, %r9 # cy = hi(p)
addq $64, %rsi
addq $64, %rdi
subq $8, %r8
jz .L27
jmp .L25
.L26:
movq 0(%rsi), %rax
movq 0(%rdi), %r10
mulq %rcx # p = a[0] * digit
addq %r10, %rax
adcq $0, %rdx # p += r[0]
addq %r9, %rax
adcq $0, %rdx # p += cy
movq %rax, 0(%rdi) # r[0] = lo(p)
movq %rdx, %r9 # cy = hi(p)
decq %r8
jz .L27
movq 8(%rsi), %rax
movq 8(%rdi), %r10
mulq %rcx # p = a[1] * digit
addq %r10, %rax
adcq $0, %rdx # p += r[1]
addq %r9, %rax
adcq $0, %rdx # p += cy
movq %rax, 8(%rdi) # r[1] = lo(p)
movq %rdx, %r9 # cy = hi(p)
decq %r8
jz .L27
movq 16(%rsi), %rax
movq 16(%rdi), %r10
mulq %rcx # p = a[2] * digit
addq %r10, %rax
adcq $0, %rdx # p += r[2]
addq %r9, %rax
adcq $0, %rdx # p += cy
movq %rax, 16(%rdi) # r[2] = lo(p)
movq %rdx, %r9 # cy = hi(p)
decq %r8
jz .L27
movq 24(%rsi), %rax
movq 24(%rdi), %r10
mulq %rcx # p = a[3] * digit
addq %r10, %rax
adcq $0, %rdx # p += r[3]
addq %r9, %rax
adcq $0, %rdx # p += cy
movq %rax, 24(%rdi) # r[3] = lo(p)
movq %rdx, %r9 # cy = hi(p)
decq %r8
jz .L27
movq 32(%rsi), %rax
movq 32(%rdi), %r10
mulq %rcx # p = a[4] * digit
addq %r10, %rax
adcq $0, %rdx # p += r[4]
addq %r9, %rax
adcq $0, %rdx # p += cy
movq %rax, 32(%rdi) # r[4] = lo(p)
movq %rdx, %r9 # cy = hi(p)
decq %r8
jz .L27
movq 40(%rsi), %rax
movq 40(%rdi), %r10
mulq %rcx # p = a[5] * digit
addq %r10, %rax
adcq $0, %rdx # p += r[5]
addq %r9, %rax
adcq $0, %rdx # p += cy
movq %rax, 40(%rdi) # r[5] = lo(p)
movq %rdx, %r9 # cy = hi(p)
decq %r8
jz .L27
movq 48(%rsi), %rax
movq 48(%rdi), %r10
mulq %rcx # p = a[6] * digit
addq %r10, %rax
adcq $0, %rdx # p += r[6]
addq %r9, %rax
adcq $0, %rdx # p += cy
movq %rax, 48(%rdi) # r[6] = lo(p)
movq %rdx, %r9 # cy = hi(p)
decq %r8
jz .L27
.L27:
movq %r9, %rax
ret
.size s_mpv_mul_add_vec64, [.-s_mpv_mul_add_vec64]

View file

@ -0,0 +1,418 @@
/ ***** BEGIN LICENSE BLOCK *****
/ Version: MPL 1.1/GPL 2.0/LGPL 2.1
/
/ The contents of this file are subject to the Mozilla Public License Version
/ 1.1 (the "License"); you may not use this file except in compliance with
/ the License. You may obtain a copy of the License at
/ http://www.mozilla.org/MPL/
/
/ Software distributed under the License is distributed on an "AS IS" basis,
/ WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
/ for the specific language governing rights and limitations under the
/ License.
/
/ The Original Code is the Solaris software cryptographic token.
/
/ The Initial Developer of the Original Code is
/ Sun Microsystems, Inc.
/ Portions created by the Initial Developer are Copyright (C) 2005
/ the Initial Developer. All Rights Reserved.
/
/ Contributor(s):
/ Sun Microsystems, Inc.
/
/ Alternatively, the contents of this file may be used under the terms of
/ either the GNU General Public License Version 2 or later (the "GPL"), or
/ the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
/ in which case the provisions of the GPL or the LGPL are applicable instead
/ of those above. If you wish to allow use of your version of this file only
/ under the terms of either the GPL or the LGPL, and not to allow others to
/ use your version of this file under the terms of the MPL, indicate your
/ decision by deleting the provisions above and replace them with the notice
/ and other provisions required by the GPL or the LGPL. If you do not delete
/ the provisions above, a recipient may use your version of this file under
/ the terms of any one of the MPL, the GPL or the LGPL.
/
/ ***** END LICENSE BLOCK ***** */
/ ------------------------------------------------------------------------
/
/ Implementation of s_mpv_mul_set_vec which exploits
/ the 64X64->128 bit unsigned multiply instruction.
/
/ ------------------------------------------------------------------------
/ r = a * digit, r and a are vectors of length len
/ returns the carry digit
/ r and a are 64 bit aligned.
/
/ uint64_t
/ s_mpv_mul_set_vec64(uint64_t *r, uint64_t *a, int len, uint64_t digit)
/
.text; .align 16; .globl s_mpv_mul_set_vec64; .type s_mpv_mul_set_vec64, @function; s_mpv_mul_set_vec64:
xorq %rax, %rax / if (len == 0) return (0)
testq %rdx, %rdx
jz .L17
movq %rdx, %r8 / Use r8 for len; %rdx is used by mul
xorq %r9, %r9 / cy = 0
.L15:
cmpq $8, %r8 / 8 - len
jb .L16
movq 0(%rsi), %rax / rax = a[0]
movq 8(%rsi), %r11 / prefetch a[1]
mulq %rcx / p = a[0] * digit
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 0(%rdi) / r[0] = lo(p)
movq %rdx, %r9 / cy = hi(p)
movq %r11, %rax
movq 16(%rsi), %r11 / prefetch a[2]
mulq %rcx / p = a[1] * digit
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 8(%rdi) / r[1] = lo(p)
movq %rdx, %r9 / cy = hi(p)
movq %r11, %rax
movq 24(%rsi), %r11 / prefetch a[3]
mulq %rcx / p = a[2] * digit
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 16(%rdi) / r[2] = lo(p)
movq %rdx, %r9 / cy = hi(p)
movq %r11, %rax
movq 32(%rsi), %r11 / prefetch a[4]
mulq %rcx / p = a[3] * digit
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 24(%rdi) / r[3] = lo(p)
movq %rdx, %r9 / cy = hi(p)
movq %r11, %rax
movq 40(%rsi), %r11 / prefetch a[5]
mulq %rcx / p = a[4] * digit
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 32(%rdi) / r[4] = lo(p)
movq %rdx, %r9 / cy = hi(p)
movq %r11, %rax
movq 48(%rsi), %r11 / prefetch a[6]
mulq %rcx / p = a[5] * digit
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 40(%rdi) / r[5] = lo(p)
movq %rdx, %r9 / cy = hi(p)
movq %r11, %rax
movq 56(%rsi), %r11 / prefetch a[7]
mulq %rcx / p = a[6] * digit
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 48(%rdi) / r[6] = lo(p)
movq %rdx, %r9 / cy = hi(p)
movq %r11, %rax
mulq %rcx / p = a[7] * digit
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 56(%rdi) / r[7] = lo(p)
movq %rdx, %r9 / cy = hi(p)
addq $64, %rsi
addq $64, %rdi
subq $8, %r8
jz .L17
jmp .L15
.L16:
movq 0(%rsi), %rax
mulq %rcx / p = a[0] * digit
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 0(%rdi) / r[0] = lo(p)
movq %rdx, %r9 / cy = hi(p)
decq %r8
jz .L17
movq 8(%rsi), %rax
mulq %rcx / p = a[1] * digit
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 8(%rdi) / r[1] = lo(p)
movq %rdx, %r9 / cy = hi(p)
decq %r8
jz .L17
movq 16(%rsi), %rax
mulq %rcx / p = a[2] * digit
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 16(%rdi) / r[2] = lo(p)
movq %rdx, %r9 / cy = hi(p)
decq %r8
jz .L17
movq 24(%rsi), %rax
mulq %rcx / p = a[3] * digit
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 24(%rdi) / r[3] = lo(p)
movq %rdx, %r9 / cy = hi(p)
decq %r8
jz .L17
movq 32(%rsi), %rax
mulq %rcx / p = a[4] * digit
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 32(%rdi) / r[4] = lo(p)
movq %rdx, %r9 / cy = hi(p)
decq %r8
jz .L17
movq 40(%rsi), %rax
mulq %rcx / p = a[5] * digit
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 40(%rdi) / r[5] = lo(p)
movq %rdx, %r9 / cy = hi(p)
decq %r8
jz .L17
movq 48(%rsi), %rax
mulq %rcx / p = a[6] * digit
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 48(%rdi) / r[6] = lo(p)
movq %rdx, %r9 / cy = hi(p)
decq %r8
jz .L17
.L17:
movq %r9, %rax
ret
.size s_mpv_mul_set_vec64, [.-s_mpv_mul_set_vec64]
/ ------------------------------------------------------------------------
/
/ Implementation of s_mpv_mul_add_vec which exploits
/ the 64X64->128 bit unsigned multiply instruction.
/
/ ------------------------------------------------------------------------
/ r += a * digit, r and a are vectors of length len
/ returns the carry digit
/ r and a are 64 bit aligned.
/
/ uint64_t
/ s_mpv_mul_add_vec64(uint64_t *r, uint64_t *a, int len, uint64_t digit)
/
.text; .align 16; .globl s_mpv_mul_add_vec64; .type s_mpv_mul_add_vec64, @function; s_mpv_mul_add_vec64:
xorq %rax, %rax / if (len == 0) return (0)
testq %rdx, %rdx
jz .L27
movq %rdx, %r8 / Use r8 for len; %rdx is used by mul
xorq %r9, %r9 / cy = 0
.L25:
cmpq $8, %r8 / 8 - len
jb .L26
movq 0(%rsi), %rax / rax = a[0]
movq 0(%rdi), %r10 / r10 = r[0]
movq 8(%rsi), %r11 / prefetch a[1]
mulq %rcx / p = a[0] * digit
addq %r10, %rax
adcq $0, %rdx / p += r[0]
movq 8(%rdi), %r10 / prefetch r[1]
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 0(%rdi) / r[0] = lo(p)
movq %rdx, %r9 / cy = hi(p)
movq %r11, %rax
movq 16(%rsi), %r11 / prefetch a[2]
mulq %rcx / p = a[1] * digit
addq %r10, %rax
adcq $0, %rdx / p += r[1]
movq 16(%rdi), %r10 / prefetch r[2]
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 8(%rdi) / r[1] = lo(p)
movq %rdx, %r9 / cy = hi(p)
movq %r11, %rax
movq 24(%rsi), %r11 / prefetch a[3]
mulq %rcx / p = a[2] * digit
addq %r10, %rax
adcq $0, %rdx / p += r[2]
movq 24(%rdi), %r10 / prefetch r[3]
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 16(%rdi) / r[2] = lo(p)
movq %rdx, %r9 / cy = hi(p)
movq %r11, %rax
movq 32(%rsi), %r11 / prefetch a[4]
mulq %rcx / p = a[3] * digit
addq %r10, %rax
adcq $0, %rdx / p += r[3]
movq 32(%rdi), %r10 / prefetch r[4]
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 24(%rdi) / r[3] = lo(p)
movq %rdx, %r9 / cy = hi(p)
movq %r11, %rax
movq 40(%rsi), %r11 / prefetch a[5]
mulq %rcx / p = a[4] * digit
addq %r10, %rax
adcq $0, %rdx / p += r[4]
movq 40(%rdi), %r10 / prefetch r[5]
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 32(%rdi) / r[4] = lo(p)
movq %rdx, %r9 / cy = hi(p)
movq %r11, %rax
movq 48(%rsi), %r11 / prefetch a[6]
mulq %rcx / p = a[5] * digit
addq %r10, %rax
adcq $0, %rdx / p += r[5]
movq 48(%rdi), %r10 / prefetch r[6]
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 40(%rdi) / r[5] = lo(p)
movq %rdx, %r9 / cy = hi(p)
movq %r11, %rax
movq 56(%rsi), %r11 / prefetch a[7]
mulq %rcx / p = a[6] * digit
addq %r10, %rax
adcq $0, %rdx / p += r[6]
movq 56(%rdi), %r10 / prefetch r[7]
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 48(%rdi) / r[6] = lo(p)
movq %rdx, %r9 / cy = hi(p)
movq %r11, %rax
mulq %rcx / p = a[7] * digit
addq %r10, %rax
adcq $0, %rdx / p += r[7]
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 56(%rdi) / r[7] = lo(p)
movq %rdx, %r9 / cy = hi(p)
addq $64, %rsi
addq $64, %rdi
subq $8, %r8
jz .L27
jmp .L25
.L26:
movq 0(%rsi), %rax
movq 0(%rdi), %r10
mulq %rcx / p = a[0] * digit
addq %r10, %rax
adcq $0, %rdx / p += r[0]
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 0(%rdi) / r[0] = lo(p)
movq %rdx, %r9 / cy = hi(p)
decq %r8
jz .L27
movq 8(%rsi), %rax
movq 8(%rdi), %r10
mulq %rcx / p = a[1] * digit
addq %r10, %rax
adcq $0, %rdx / p += r[1]
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 8(%rdi) / r[1] = lo(p)
movq %rdx, %r9 / cy = hi(p)
decq %r8
jz .L27
movq 16(%rsi), %rax
movq 16(%rdi), %r10
mulq %rcx / p = a[2] * digit
addq %r10, %rax
adcq $0, %rdx / p += r[2]
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 16(%rdi) / r[2] = lo(p)
movq %rdx, %r9 / cy = hi(p)
decq %r8
jz .L27
movq 24(%rsi), %rax
movq 24(%rdi), %r10
mulq %rcx / p = a[3] * digit
addq %r10, %rax
adcq $0, %rdx / p += r[3]
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 24(%rdi) / r[3] = lo(p)
movq %rdx, %r9 / cy = hi(p)
decq %r8
jz .L27
movq 32(%rsi), %rax
movq 32(%rdi), %r10
mulq %rcx / p = a[4] * digit
addq %r10, %rax
adcq $0, %rdx / p += r[4]
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 32(%rdi) / r[4] = lo(p)
movq %rdx, %r9 / cy = hi(p)
decq %r8
jz .L27
movq 40(%rsi), %rax
movq 40(%rdi), %r10
mulq %rcx / p = a[5] * digit
addq %r10, %rax
adcq $0, %rdx / p += r[5]
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 40(%rdi) / r[5] = lo(p)
movq %rdx, %r9 / cy = hi(p)
decq %r8
jz .L27
movq 48(%rsi), %rax
movq 48(%rdi), %r10
mulq %rcx / p = a[6] * digit
addq %r10, %rax
adcq $0, %rdx / p += r[6]
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 48(%rdi) / r[6] = lo(p)
movq %rdx, %r9 / cy = hi(p)
decq %r8
jz .L27
.L27:
movq %r9, %rax
ret
.size s_mpv_mul_add_vec64, [.-s_mpv_mul_add_vec64]

View file

@ -217,3 +217,14 @@ CFLAGS= -O2 -fPIC -DLINUX1_2 -Di386 -D_XOPEN_SOURCE -DLINUX2_1 -ansi -Wall \
-pipe -DLINUX -Dlinux -D_POSIX_SOURCE -D_BSD_SOURCE -DHAVE_STRERROR \
-DXP_UNIX -UDEBUG -DNDEBUG -D_REENTRANT $(MPICMN)
endif
ifeq ($(TARGET),AMD64SOLARIS)
ASFLAGS += -xarch=generic64
AS_OBJS = mpi_amd64.o mpi_amd64_sun.o
MP_CONFIG = -DMP_ASSEMBLY_MULTIPLY -DMPI_AMD64
CFLAGS = -xarch=generic64 -xO4 -I. -DMP_API_COMPATIBLE -DMP_IOFUNC $(MP_CONFIG)
MPICMN += $(MP_CONFIG)
mpi_amd64_asm.o: mpi_amd64_sun.s
$(AS) -xarch=generic64 -P -D_ASM mpi_amd64_sun.s
endif