Fix for bug 272327 . AMD64 assembly optimization for bignum multiply. r=nelson

This commit is contained in:
julien.pierre.bugs%sun.com 2005-02-25 04:30:11 +00:00
Parent 9de196f20f
Commit 0126f05e21
7 changed files: 952 additions and 10 deletions

View file

@ -129,9 +129,10 @@ endif
ifeq ($(OS_TARGET),Linux)
ifeq ($(CPU_ARCH),x86_64)
ASFILES = arcfour-amd64-gas.s
ASFLAGS += -march=opteron -m64
DEFINES += -DNSS_BEVAND_ARCFOUR
ASFILES = arcfour-amd64-gas.s mpi_amd64_gas.s
ASFLAGS += -march=opteron -m64 -fPIC
DEFINES += -DNSS_BEVAND_ARCFOUR -DMPI_AMD64 -DMP_ASSEMBLY_MULTIPLY
MPI_SRCS += mpi_amd64.c
endif
ifeq ($(CPU_ARCH),x86)
ASFILES = mpi_x86.s
@ -246,13 +247,14 @@ else
ifeq ($(USE_64),1)
# Solaris for AMD64
ifdef NS_USE_GCC
ASFILES = arcfour-amd64-gas.s
ASFLAGS += -march=opteron -m64
ASFILES = arcfour-amd64-gas.s mpi_amd64_gas.s
ASFLAGS += -march=opteron -m64 -fPIC
else
ASFILES = arcfour-amd64-sun.s
ASFILES = arcfour-amd64-sun.s mpi_amd64_sun.s
ASFLAGS += -xarch=generic64 -K PIC
endif
DEFINES += -DNSS_BEVAND_ARCFOUR
DEFINES += -DNSS_BEVAND_ARCFOUR -DMPI_AMD64 -DMP_ASSEMBLY_MULTIPLY
MPI_SRCS += mpi_amd64.c
else
# Solaris x86
DEFINES += -D_X86_

View file

@ -42,7 +42,7 @@
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
/* $Id: mpi-priv.h,v 1.17 2004-04-27 23:04:36 gerv%gerv.net Exp $ */
/* $Id: mpi-priv.h,v 1.18 2005-02-25 04:30:11 julien.pierre.bugs%sun.com Exp $ */
#ifndef _MPI_PRIV_H_
#define _MPI_PRIV_H_ 1
@ -238,10 +238,28 @@ mp_err s_mp_invmod_even_m(const mp_int *a, const mp_int *m, mp_int *c);
#define MPI_ASM_DECL
#endif
#ifdef MPI_AMD64
mp_digit MPI_ASM_DECL s_mpv_mul_set_vec64(mp_digit*, mp_digit *, mp_size, mp_digit);
mp_digit MPI_ASM_DECL s_mpv_mul_add_vec64(mp_digit*, const mp_digit*, mp_size, mp_digit);
/* c = a * b */
#define s_mpv_mul_d(a, a_len, b, c) \
((unsigned long*)c)[a_len] = s_mpv_mul_set_vec64(c, a, a_len, b)
/* c += a * b */
#define s_mpv_mul_d_add(a, a_len, b, c) \
((unsigned long*)c)[a_len] = s_mpv_mul_add_vec64(c, a, a_len, b)
#else
void MPI_ASM_DECL s_mpv_mul_d(const mp_digit *a, mp_size a_len,
mp_digit b, mp_digit *c);
void MPI_ASM_DECL s_mpv_mul_d_add(const mp_digit *a, mp_size a_len,
mp_digit b, mp_digit *c);
#endif
void MPI_ASM_DECL s_mpv_mul_d_add_prop(const mp_digit *a,
mp_size a_len, mp_digit b,
mp_digit *c);

View file

@ -42,7 +42,7 @@
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
/* $Id: mpi-test.c,v 1.12 2004-04-27 23:04:36 gerv%gerv.net Exp $ */
/* $Id: mpi-test.c,v 1.13 2005-02-25 04:30:11 julien.pierre.bugs%sun.com Exp $ */
#include <stdio.h>
#include <stdlib.h>
@ -134,6 +134,7 @@ const char *mp19 =
"F1C9DACDA287F2E3C88DCE2393B8F53DAAAC1196DC36510962B6B59454CFE64B";
const char *mp20 =
"D445662C8B6FE394107B867797750C326E0F4A967E135FC430F6CD7207913AC7";
const char* mp21 = "2";
const mp_digit md1 = 0;
const mp_digit md2 = 0x1;
@ -201,7 +202,7 @@ const char *p_mp1415 =
"548F1732452F9E7F810C0B4B430C073C0FBCE03F0D03F82630654BCE166AA772E1EE"
"DD0C08D3E3EBDF0AF54203B43AFDFC40D8FC79C97A4B0A4E1BEB14D8FCEFDDED8758"
"6ED65B18";
const char *p_mp2121 = "4";
const char *mp_mp345 = "B9B6D3A3";
const char *mp_mp335 = "16609C2D";
@ -874,6 +875,15 @@ int test_mul(void)
reason("error: computed %s, expected %s\n", g_intbuf, p_mp1415);
res = 1;
}
mp_read_radix(&a, mp21, 10); mp_read_radix(&b, mp21, 10);
IFOK( mp_mul(&a, &b, &a) );
mp_toradix(&a, g_intbuf, 10);
if(strcmp(g_intbuf, p_mp2121) != 0) {
reason("error: computed %s, expected %s\n", g_intbuf, p_mp2121);
res = 1; goto CLEANUP;
}
CLEANUP:
mp_clear(&a); mp_clear(&b);

View file

@ -0,0 +1,65 @@
/* ***** BEGIN LICENSE BLOCK *****
* Version: MPL 1.1/GPL 2.0/LGPL 2.1
*
* The contents of this file are subject to the Mozilla Public License Version
* 1.1 (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
* http://www.mozilla.org/MPL/
*
* Software distributed under the License is distributed on an "AS IS" basis,
* WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
* for the specific language governing rights and limitations under the
* License.
*
* The Original Code is the Solaris software cryptographic token.
*
* The Initial Developer of the Original Code is
* Sun Microsystems, Inc.
* Portions created by the Initial Developer are Copyright (C) 2005
* the Initial Developer. All Rights Reserved.
*
* Contributor(s):
* Sun Microsystems, Inc.
*
* Alternatively, the contents of this file may be used under the terms of
* either the GNU General Public License Version 2 or later (the "GPL"), or
* the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
* in which case the provisions of the GPL or the LGPL are applicable instead
* of those above. If you wish to allow use of your version of this file only
* under the terms of either the GPL or the LGPL, and not to allow others to
* use your version of this file under the terms of the MPL, indicate your
* decision by deleting the provisions above and replace them with the notice
* and other provisions required by the GPL or the LGPL. If you do not delete
* the provisions above, a recipient may use your version of this file under
* the terms of any one of the MPL, the GPL or the LGPL.
*
* ***** END LICENSE BLOCK ***** */
#ifndef MPI_AMD64
#error This file only works on AMD64 platforms.
#endif
#include <mpi-priv.h>
/*
* MPI glue
*
*/
/* Presently, this is only used by the Montgomery arithmetic code. */
/* c += a * b */
/*
 * Multiply-accumulate with full carry propagation: c += a * b, where a is
 * an a_len-digit vector and b is a single digit.  The assembly primitive
 * s_mpv_mul_add_vec64 handles the low a_len digits and returns the carry
 * out of that window; this wrapper then ripples that carry through the
 * higher digits of c until it is absorbed.
 */
void MPI_ASM_DECL s_mpv_mul_d_add_prop(const mp_digit *a, mp_size a_len,
                                       mp_digit b, mp_digit *c)
{
    /* Carry out of the vector multiply-add over c[0 .. a_len-1]. */
    mp_digit carry = s_mpv_mul_add_vec64(c, a, a_len, b);
    /* First digit above the product window. */
    mp_digit *p = c + a_len;

    /* Propagate the carry upward.  For unsigned addition, the sum wraps
     * exactly when it ends up smaller than either addend, so a new carry
     * of at most 1 is produced per digit. */
    while (carry != 0) {
        mp_digit sum = *p + carry;
        carry = (sum < carry) ? 1 : 0;
        *p++ = sum;
    }
}

View file

@ -0,0 +1,418 @@
# ***** BEGIN LICENSE BLOCK *****
# Version: MPL 1.1/GPL 2.0/LGPL 2.1
#
# The contents of this file are subject to the Mozilla Public License Version
# 1.1 (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
# http://www.mozilla.org/MPL/
#
# Software distributed under the License is distributed on an "AS IS" basis,
# WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
# for the specific language governing rights and limitations under the
# License.
#
# The Original Code is the Solaris software cryptographic token.
#
# The Initial Developer of the Original Code is
# Sun Microsystems, Inc.
# Portions created by the Initial Developer are Copyright (C) 2005
# the Initial Developer. All Rights Reserved.
#
# Contributor(s):
# Sun Microsystems, Inc.
#
# Alternatively, the contents of this file may be used under the terms of
# either the GNU General Public License Version 2 or later (the "GPL"), or
# the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
# in which case the provisions of the GPL or the LGPL are applicable instead
# of those above. If you wish to allow use of your version of this file only
# under the terms of either the GPL or the LGPL, and not to allow others to
# use your version of this file under the terms of the MPL, indicate your
# decision by deleting the provisions above and replace them with the notice
# and other provisions required by the GPL or the LGPL. If you do not delete
# the provisions above, a recipient may use your version of this file under
# the terms of any one of the MPL, the GPL or the LGPL.
#
# ***** END LICENSE BLOCK ***** */
# ------------------------------------------------------------------------
#
# Implementation of s_mpv_mul_set_vec which exploits
# the 64X64->128 bit unsigned multiply instruction.
#
# ------------------------------------------------------------------------
# r = a * digit, r and a are vectors of length len
# returns the carry digit
# r and a are 64 bit aligned.
#
# uint64_t
# s_mpv_mul_set_vec64(uint64_t *r, uint64_t *a, int len, uint64_t digit)
#
.text; .align 16; .globl s_mpv_mul_set_vec64; .type s_mpv_mul_set_vec64, @function; s_mpv_mul_set_vec64:
xorq %rax, %rax # if (len == 0) return (0)
testq %rdx, %rdx
jz .L17
movq %rdx, %r8 # Use r8 for len; %rdx is used by mul
xorq %r9, %r9 # cy = 0
.L15:
cmpq $8, %r8 # 8 - len
jb .L16
movq 0(%rsi), %rax # rax = a[0]
movq 8(%rsi), %r11 # prefetch a[1]
mulq %rcx # p = a[0] * digit
addq %r9, %rax
adcq $0, %rdx # p += cy
movq %rax, 0(%rdi) # r[0] = lo(p)
movq %rdx, %r9 # cy = hi(p)
movq %r11, %rax
movq 16(%rsi), %r11 # prefetch a[2]
mulq %rcx # p = a[1] * digit
addq %r9, %rax
adcq $0, %rdx # p += cy
movq %rax, 8(%rdi) # r[1] = lo(p)
movq %rdx, %r9 # cy = hi(p)
movq %r11, %rax
movq 24(%rsi), %r11 # prefetch a[3]
mulq %rcx # p = a[2] * digit
addq %r9, %rax
adcq $0, %rdx # p += cy
movq %rax, 16(%rdi) # r[2] = lo(p)
movq %rdx, %r9 # cy = hi(p)
movq %r11, %rax
movq 32(%rsi), %r11 # prefetch a[4]
mulq %rcx # p = a[3] * digit
addq %r9, %rax
adcq $0, %rdx # p += cy
movq %rax, 24(%rdi) # r[3] = lo(p)
movq %rdx, %r9 # cy = hi(p)
movq %r11, %rax
movq 40(%rsi), %r11 # prefetch a[5]
mulq %rcx # p = a[4] * digit
addq %r9, %rax
adcq $0, %rdx # p += cy
movq %rax, 32(%rdi) # r[4] = lo(p)
movq %rdx, %r9 # cy = hi(p)
movq %r11, %rax
movq 48(%rsi), %r11 # prefetch a[6]
mulq %rcx # p = a[5] * digit
addq %r9, %rax
adcq $0, %rdx # p += cy
movq %rax, 40(%rdi) # r[5] = lo(p)
movq %rdx, %r9 # cy = hi(p)
movq %r11, %rax
movq 56(%rsi), %r11 # prefetch a[7]
mulq %rcx # p = a[6] * digit
addq %r9, %rax
adcq $0, %rdx # p += cy
movq %rax, 48(%rdi) # r[6] = lo(p)
movq %rdx, %r9 # cy = hi(p)
movq %r11, %rax
mulq %rcx # p = a[7] * digit
addq %r9, %rax
adcq $0, %rdx # p += cy
movq %rax, 56(%rdi) # r[7] = lo(p)
movq %rdx, %r9 # cy = hi(p)
addq $64, %rsi
addq $64, %rdi
subq $8, %r8
jz .L17
jmp .L15
.L16:
movq 0(%rsi), %rax
mulq %rcx # p = a[0] * digit
addq %r9, %rax
adcq $0, %rdx # p += cy
movq %rax, 0(%rdi) # r[0] = lo(p)
movq %rdx, %r9 # cy = hi(p)
decq %r8
jz .L17
movq 8(%rsi), %rax
mulq %rcx # p = a[1] * digit
addq %r9, %rax
adcq $0, %rdx # p += cy
movq %rax, 8(%rdi) # r[1] = lo(p)
movq %rdx, %r9 # cy = hi(p)
decq %r8
jz .L17
movq 16(%rsi), %rax
mulq %rcx # p = a[2] * digit
addq %r9, %rax
adcq $0, %rdx # p += cy
movq %rax, 16(%rdi) # r[2] = lo(p)
movq %rdx, %r9 # cy = hi(p)
decq %r8
jz .L17
movq 24(%rsi), %rax
mulq %rcx # p = a[3] * digit
addq %r9, %rax
adcq $0, %rdx # p += cy
movq %rax, 24(%rdi) # r[3] = lo(p)
movq %rdx, %r9 # cy = hi(p)
decq %r8
jz .L17
movq 32(%rsi), %rax
mulq %rcx # p = a[4] * digit
addq %r9, %rax
adcq $0, %rdx # p += cy
movq %rax, 32(%rdi) # r[4] = lo(p)
movq %rdx, %r9 # cy = hi(p)
decq %r8
jz .L17
movq 40(%rsi), %rax
mulq %rcx # p = a[5] * digit
addq %r9, %rax
adcq $0, %rdx # p += cy
movq %rax, 40(%rdi) # r[5] = lo(p)
movq %rdx, %r9 # cy = hi(p)
decq %r8
jz .L17
movq 48(%rsi), %rax
mulq %rcx # p = a[6] * digit
addq %r9, %rax
adcq $0, %rdx # p += cy
movq %rax, 48(%rdi) # r[6] = lo(p)
movq %rdx, %r9 # cy = hi(p)
decq %r8
jz .L17
.L17:
movq %r9, %rax
ret
.size s_mpv_mul_set_vec64, [.-s_mpv_mul_set_vec64]
# ------------------------------------------------------------------------
#
# Implementation of s_mpv_mul_add_vec which exploits
# the 64X64->128 bit unsigned multiply instruction.
#
# ------------------------------------------------------------------------
# r += a * digit, r and a are vectors of length len
# returns the carry digit
# r and a are 64 bit aligned.
#
# uint64_t
# s_mpv_mul_add_vec64(uint64_t *r, uint64_t *a, int len, uint64_t digit)
#
.text; .align 16; .globl s_mpv_mul_add_vec64; .type s_mpv_mul_add_vec64, @function; s_mpv_mul_add_vec64:
xorq %rax, %rax # if (len == 0) return (0)
testq %rdx, %rdx
jz .L27
movq %rdx, %r8 # Use r8 for len; %rdx is used by mul
xorq %r9, %r9 # cy = 0
.L25:
cmpq $8, %r8 # 8 - len
jb .L26
movq 0(%rsi), %rax # rax = a[0]
movq 0(%rdi), %r10 # r10 = r[0]
movq 8(%rsi), %r11 # prefetch a[1]
mulq %rcx # p = a[0] * digit
addq %r10, %rax
adcq $0, %rdx # p += r[0]
movq 8(%rdi), %r10 # prefetch r[1]
addq %r9, %rax
adcq $0, %rdx # p += cy
movq %rax, 0(%rdi) # r[0] = lo(p)
movq %rdx, %r9 # cy = hi(p)
movq %r11, %rax
movq 16(%rsi), %r11 # prefetch a[2]
mulq %rcx # p = a[1] * digit
addq %r10, %rax
adcq $0, %rdx # p += r[1]
movq 16(%rdi), %r10 # prefetch r[2]
addq %r9, %rax
adcq $0, %rdx # p += cy
movq %rax, 8(%rdi) # r[1] = lo(p)
movq %rdx, %r9 # cy = hi(p)
movq %r11, %rax
movq 24(%rsi), %r11 # prefetch a[3]
mulq %rcx # p = a[2] * digit
addq %r10, %rax
adcq $0, %rdx # p += r[2]
movq 24(%rdi), %r10 # prefetch r[3]
addq %r9, %rax
adcq $0, %rdx # p += cy
movq %rax, 16(%rdi) # r[2] = lo(p)
movq %rdx, %r9 # cy = hi(p)
movq %r11, %rax
movq 32(%rsi), %r11 # prefetch a[4]
mulq %rcx # p = a[3] * digit
addq %r10, %rax
adcq $0, %rdx # p += r[3]
movq 32(%rdi), %r10 # prefetch r[4]
addq %r9, %rax
adcq $0, %rdx # p += cy
movq %rax, 24(%rdi) # r[3] = lo(p)
movq %rdx, %r9 # cy = hi(p)
movq %r11, %rax
movq 40(%rsi), %r11 # prefetch a[5]
mulq %rcx # p = a[4] * digit
addq %r10, %rax
adcq $0, %rdx # p += r[4]
movq 40(%rdi), %r10 # prefetch r[5]
addq %r9, %rax
adcq $0, %rdx # p += cy
movq %rax, 32(%rdi) # r[4] = lo(p)
movq %rdx, %r9 # cy = hi(p)
movq %r11, %rax
movq 48(%rsi), %r11 # prefetch a[6]
mulq %rcx # p = a[5] * digit
addq %r10, %rax
adcq $0, %rdx # p += r[5]
movq 48(%rdi), %r10 # prefetch r[6]
addq %r9, %rax
adcq $0, %rdx # p += cy
movq %rax, 40(%rdi) # r[5] = lo(p)
movq %rdx, %r9 # cy = hi(p)
movq %r11, %rax
movq 56(%rsi), %r11 # prefetch a[7]
mulq %rcx # p = a[6] * digit
addq %r10, %rax
adcq $0, %rdx # p += r[6]
movq 56(%rdi), %r10 # prefetch r[7]
addq %r9, %rax
adcq $0, %rdx # p += cy
movq %rax, 48(%rdi) # r[6] = lo(p)
movq %rdx, %r9 # cy = hi(p)
movq %r11, %rax
mulq %rcx # p = a[7] * digit
addq %r10, %rax
adcq $0, %rdx # p += r[7]
addq %r9, %rax
adcq $0, %rdx # p += cy
movq %rax, 56(%rdi) # r[7] = lo(p)
movq %rdx, %r9 # cy = hi(p)
addq $64, %rsi
addq $64, %rdi
subq $8, %r8
jz .L27
jmp .L25
.L26:
movq 0(%rsi), %rax
movq 0(%rdi), %r10
mulq %rcx # p = a[0] * digit
addq %r10, %rax
adcq $0, %rdx # p += r[0]
addq %r9, %rax
adcq $0, %rdx # p += cy
movq %rax, 0(%rdi) # r[0] = lo(p)
movq %rdx, %r9 # cy = hi(p)
decq %r8
jz .L27
movq 8(%rsi), %rax
movq 8(%rdi), %r10
mulq %rcx # p = a[1] * digit
addq %r10, %rax
adcq $0, %rdx # p += r[1]
addq %r9, %rax
adcq $0, %rdx # p += cy
movq %rax, 8(%rdi) # r[1] = lo(p)
movq %rdx, %r9 # cy = hi(p)
decq %r8
jz .L27
movq 16(%rsi), %rax
movq 16(%rdi), %r10
mulq %rcx # p = a[2] * digit
addq %r10, %rax
adcq $0, %rdx # p += r[2]
addq %r9, %rax
adcq $0, %rdx # p += cy
movq %rax, 16(%rdi) # r[2] = lo(p)
movq %rdx, %r9 # cy = hi(p)
decq %r8
jz .L27
movq 24(%rsi), %rax
movq 24(%rdi), %r10
mulq %rcx # p = a[3] * digit
addq %r10, %rax
adcq $0, %rdx # p += r[3]
addq %r9, %rax
adcq $0, %rdx # p += cy
movq %rax, 24(%rdi) # r[3] = lo(p)
movq %rdx, %r9 # cy = hi(p)
decq %r8
jz .L27
movq 32(%rsi), %rax
movq 32(%rdi), %r10
mulq %rcx # p = a[4] * digit
addq %r10, %rax
adcq $0, %rdx # p += r[4]
addq %r9, %rax
adcq $0, %rdx # p += cy
movq %rax, 32(%rdi) # r[4] = lo(p)
movq %rdx, %r9 # cy = hi(p)
decq %r8
jz .L27
movq 40(%rsi), %rax
movq 40(%rdi), %r10
mulq %rcx # p = a[5] * digit
addq %r10, %rax
adcq $0, %rdx # p += r[5]
addq %r9, %rax
adcq $0, %rdx # p += cy
movq %rax, 40(%rdi) # r[5] = lo(p)
movq %rdx, %r9 # cy = hi(p)
decq %r8
jz .L27
movq 48(%rsi), %rax
movq 48(%rdi), %r10
mulq %rcx # p = a[6] * digit
addq %r10, %rax
adcq $0, %rdx # p += r[6]
addq %r9, %rax
adcq $0, %rdx # p += cy
movq %rax, 48(%rdi) # r[6] = lo(p)
movq %rdx, %r9 # cy = hi(p)
decq %r8
jz .L27
.L27:
movq %r9, %rax
ret
.size s_mpv_mul_add_vec64, [.-s_mpv_mul_add_vec64]

View file

@ -0,0 +1,418 @@
/ ***** BEGIN LICENSE BLOCK *****
/ Version: MPL 1.1/GPL 2.0/LGPL 2.1
/
/ The contents of this file are subject to the Mozilla Public License Version
/ 1.1 (the "License"); you may not use this file except in compliance with
/ the License. You may obtain a copy of the License at
/ http://www.mozilla.org/MPL/
/
/ Software distributed under the License is distributed on an "AS IS" basis,
/ WITHOUT WARRANTY OF ANY KIND, either express or implied. See the License
/ for the specific language governing rights and limitations under the
/ License.
/
/ The Original Code is the Solaris software cryptographic token.
/
/ The Initial Developer of the Original Code is
/ Sun Microsystems, Inc.
/ Portions created by the Initial Developer are Copyright (C) 2005
/ the Initial Developer. All Rights Reserved.
/
/ Contributor(s):
/ Sun Microsystems, Inc.
/
/ Alternatively, the contents of this file may be used under the terms of
/ either the GNU General Public License Version 2 or later (the "GPL"), or
/ the GNU Lesser General Public License Version 2.1 or later (the "LGPL"),
/ in which case the provisions of the GPL or the LGPL are applicable instead
/ of those above. If you wish to allow use of your version of this file only
/ under the terms of either the GPL or the LGPL, and not to allow others to
/ use your version of this file under the terms of the MPL, indicate your
/ decision by deleting the provisions above and replace them with the notice
/ and other provisions required by the GPL or the LGPL. If you do not delete
/ the provisions above, a recipient may use your version of this file under
/ the terms of any one of the MPL, the GPL or the LGPL.
/
/ ***** END LICENSE BLOCK ***** */
/ ------------------------------------------------------------------------
/
/ Implementation of s_mpv_mul_set_vec which exploits
/ the 64X64->128 bit unsigned multiply instruction.
/
/ ------------------------------------------------------------------------
/ r = a * digit, r and a are vectors of length len
/ returns the carry digit
/ r and a are 64 bit aligned.
/
/ uint64_t
/ s_mpv_mul_set_vec64(uint64_t *r, uint64_t *a, int len, uint64_t digit)
/
.text; .align 16; .globl s_mpv_mul_set_vec64; .type s_mpv_mul_set_vec64, @function; s_mpv_mul_set_vec64:
xorq %rax, %rax / if (len == 0) return (0)
testq %rdx, %rdx
jz .L17
movq %rdx, %r8 / Use r8 for len; %rdx is used by mul
xorq %r9, %r9 / cy = 0
.L15:
cmpq $8, %r8 / 8 - len
jb .L16
movq 0(%rsi), %rax / rax = a[0]
movq 8(%rsi), %r11 / prefetch a[1]
mulq %rcx / p = a[0] * digit
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 0(%rdi) / r[0] = lo(p)
movq %rdx, %r9 / cy = hi(p)
movq %r11, %rax
movq 16(%rsi), %r11 / prefetch a[2]
mulq %rcx / p = a[1] * digit
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 8(%rdi) / r[1] = lo(p)
movq %rdx, %r9 / cy = hi(p)
movq %r11, %rax
movq 24(%rsi), %r11 / prefetch a[3]
mulq %rcx / p = a[2] * digit
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 16(%rdi) / r[2] = lo(p)
movq %rdx, %r9 / cy = hi(p)
movq %r11, %rax
movq 32(%rsi), %r11 / prefetch a[4]
mulq %rcx / p = a[3] * digit
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 24(%rdi) / r[3] = lo(p)
movq %rdx, %r9 / cy = hi(p)
movq %r11, %rax
movq 40(%rsi), %r11 / prefetch a[5]
mulq %rcx / p = a[4] * digit
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 32(%rdi) / r[4] = lo(p)
movq %rdx, %r9 / cy = hi(p)
movq %r11, %rax
movq 48(%rsi), %r11 / prefetch a[6]
mulq %rcx / p = a[5] * digit
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 40(%rdi) / r[5] = lo(p)
movq %rdx, %r9 / cy = hi(p)
movq %r11, %rax
movq 56(%rsi), %r11 / prefetch a[7]
mulq %rcx / p = a[6] * digit
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 48(%rdi) / r[6] = lo(p)
movq %rdx, %r9 / cy = hi(p)
movq %r11, %rax
mulq %rcx / p = a[7] * digit
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 56(%rdi) / r[7] = lo(p)
movq %rdx, %r9 / cy = hi(p)
addq $64, %rsi
addq $64, %rdi
subq $8, %r8
jz .L17
jmp .L15
.L16:
movq 0(%rsi), %rax
mulq %rcx / p = a[0] * digit
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 0(%rdi) / r[0] = lo(p)
movq %rdx, %r9 / cy = hi(p)
decq %r8
jz .L17
movq 8(%rsi), %rax
mulq %rcx / p = a[1] * digit
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 8(%rdi) / r[1] = lo(p)
movq %rdx, %r9 / cy = hi(p)
decq %r8
jz .L17
movq 16(%rsi), %rax
mulq %rcx / p = a[2] * digit
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 16(%rdi) / r[2] = lo(p)
movq %rdx, %r9 / cy = hi(p)
decq %r8
jz .L17
movq 24(%rsi), %rax
mulq %rcx / p = a[3] * digit
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 24(%rdi) / r[3] = lo(p)
movq %rdx, %r9 / cy = hi(p)
decq %r8
jz .L17
movq 32(%rsi), %rax
mulq %rcx / p = a[4] * digit
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 32(%rdi) / r[4] = lo(p)
movq %rdx, %r9 / cy = hi(p)
decq %r8
jz .L17
movq 40(%rsi), %rax
mulq %rcx / p = a[5] * digit
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 40(%rdi) / r[5] = lo(p)
movq %rdx, %r9 / cy = hi(p)
decq %r8
jz .L17
movq 48(%rsi), %rax
mulq %rcx / p = a[6] * digit
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 48(%rdi) / r[6] = lo(p)
movq %rdx, %r9 / cy = hi(p)
decq %r8
jz .L17
.L17:
movq %r9, %rax
ret
.size s_mpv_mul_set_vec64, [.-s_mpv_mul_set_vec64]
/ ------------------------------------------------------------------------
/
/ Implementation of s_mpv_mul_add_vec which exploits
/ the 64X64->128 bit unsigned multiply instruction.
/
/ ------------------------------------------------------------------------
/ r += a * digit, r and a are vectors of length len
/ returns the carry digit
/ r and a are 64 bit aligned.
/
/ uint64_t
/ s_mpv_mul_add_vec64(uint64_t *r, uint64_t *a, int len, uint64_t digit)
/
.text; .align 16; .globl s_mpv_mul_add_vec64; .type s_mpv_mul_add_vec64, @function; s_mpv_mul_add_vec64:
xorq %rax, %rax / if (len == 0) return (0)
testq %rdx, %rdx
jz .L27
movq %rdx, %r8 / Use r8 for len; %rdx is used by mul
xorq %r9, %r9 / cy = 0
.L25:
cmpq $8, %r8 / 8 - len
jb .L26
movq 0(%rsi), %rax / rax = a[0]
movq 0(%rdi), %r10 / r10 = r[0]
movq 8(%rsi), %r11 / prefetch a[1]
mulq %rcx / p = a[0] * digit
addq %r10, %rax
adcq $0, %rdx / p += r[0]
movq 8(%rdi), %r10 / prefetch r[1]
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 0(%rdi) / r[0] = lo(p)
movq %rdx, %r9 / cy = hi(p)
movq %r11, %rax
movq 16(%rsi), %r11 / prefetch a[2]
mulq %rcx / p = a[1] * digit
addq %r10, %rax
adcq $0, %rdx / p += r[1]
movq 16(%rdi), %r10 / prefetch r[2]
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 8(%rdi) / r[1] = lo(p)
movq %rdx, %r9 / cy = hi(p)
movq %r11, %rax
movq 24(%rsi), %r11 / prefetch a[3]
mulq %rcx / p = a[2] * digit
addq %r10, %rax
adcq $0, %rdx / p += r[2]
movq 24(%rdi), %r10 / prefetch r[3]
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 16(%rdi) / r[2] = lo(p)
movq %rdx, %r9 / cy = hi(p)
movq %r11, %rax
movq 32(%rsi), %r11 / prefetch a[4]
mulq %rcx / p = a[3] * digit
addq %r10, %rax
adcq $0, %rdx / p += r[3]
movq 32(%rdi), %r10 / prefetch r[4]
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 24(%rdi) / r[3] = lo(p)
movq %rdx, %r9 / cy = hi(p)
movq %r11, %rax
movq 40(%rsi), %r11 / prefetch a[5]
mulq %rcx / p = a[4] * digit
addq %r10, %rax
adcq $0, %rdx / p += r[4]
movq 40(%rdi), %r10 / prefetch r[5]
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 32(%rdi) / r[4] = lo(p)
movq %rdx, %r9 / cy = hi(p)
movq %r11, %rax
movq 48(%rsi), %r11 / prefetch a[6]
mulq %rcx / p = a[5] * digit
addq %r10, %rax
adcq $0, %rdx / p += r[5]
movq 48(%rdi), %r10 / prefetch r[6]
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 40(%rdi) / r[5] = lo(p)
movq %rdx, %r9 / cy = hi(p)
movq %r11, %rax
movq 56(%rsi), %r11 / prefetch a[7]
mulq %rcx / p = a[6] * digit
addq %r10, %rax
adcq $0, %rdx / p += r[6]
movq 56(%rdi), %r10 / prefetch r[7]
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 48(%rdi) / r[6] = lo(p)
movq %rdx, %r9 / cy = hi(p)
movq %r11, %rax
mulq %rcx / p = a[7] * digit
addq %r10, %rax
adcq $0, %rdx / p += r[7]
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 56(%rdi) / r[7] = lo(p)
movq %rdx, %r9 / cy = hi(p)
addq $64, %rsi
addq $64, %rdi
subq $8, %r8
jz .L27
jmp .L25
.L26:
movq 0(%rsi), %rax
movq 0(%rdi), %r10
mulq %rcx / p = a[0] * digit
addq %r10, %rax
adcq $0, %rdx / p += r[0]
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 0(%rdi) / r[0] = lo(p)
movq %rdx, %r9 / cy = hi(p)
decq %r8
jz .L27
movq 8(%rsi), %rax
movq 8(%rdi), %r10
mulq %rcx / p = a[1] * digit
addq %r10, %rax
adcq $0, %rdx / p += r[1]
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 8(%rdi) / r[1] = lo(p)
movq %rdx, %r9 / cy = hi(p)
decq %r8
jz .L27
movq 16(%rsi), %rax
movq 16(%rdi), %r10
mulq %rcx / p = a[2] * digit
addq %r10, %rax
adcq $0, %rdx / p += r[2]
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 16(%rdi) / r[2] = lo(p)
movq %rdx, %r9 / cy = hi(p)
decq %r8
jz .L27
movq 24(%rsi), %rax
movq 24(%rdi), %r10
mulq %rcx / p = a[3] * digit
addq %r10, %rax
adcq $0, %rdx / p += r[3]
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 24(%rdi) / r[3] = lo(p)
movq %rdx, %r9 / cy = hi(p)
decq %r8
jz .L27
movq 32(%rsi), %rax
movq 32(%rdi), %r10
mulq %rcx / p = a[4] * digit
addq %r10, %rax
adcq $0, %rdx / p += r[4]
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 32(%rdi) / r[4] = lo(p)
movq %rdx, %r9 / cy = hi(p)
decq %r8
jz .L27
movq 40(%rsi), %rax
movq 40(%rdi), %r10
mulq %rcx / p = a[5] * digit
addq %r10, %rax
adcq $0, %rdx / p += r[5]
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 40(%rdi) / r[5] = lo(p)
movq %rdx, %r9 / cy = hi(p)
decq %r8
jz .L27
movq 48(%rsi), %rax
movq 48(%rdi), %r10
mulq %rcx / p = a[6] * digit
addq %r10, %rax
adcq $0, %rdx / p += r[6]
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 48(%rdi) / r[6] = lo(p)
movq %rdx, %r9 / cy = hi(p)
decq %r8
jz .L27
.L27:
movq %r9, %rax
ret
.size s_mpv_mul_add_vec64, [.-s_mpv_mul_add_vec64]

View file

@ -217,3 +217,14 @@ CFLAGS= -O2 -fPIC -DLINUX1_2 -Di386 -D_XOPEN_SOURCE -DLINUX2_1 -ansi -Wall \
-pipe -DLINUX -Dlinux -D_POSIX_SOURCE -D_BSD_SOURCE -DHAVE_STRERROR \
-DXP_UNIX -UDEBUG -DNDEBUG -D_REENTRANT $(MPICMN)
endif
ifeq ($(TARGET),AMD64SOLARIS)
ASFLAGS += -xarch=generic64
AS_OBJS = mpi_amd64.o mpi_amd64_sun.o
MP_CONFIG = -DMP_ASSEMBLY_MULTIPLY -DMPI_AMD64
CFLAGS = -xarch=generic64 -xO4 -I. -DMP_API_COMPATIBLE -DMP_IOFUNC $(MP_CONFIG)
MPICMN += $(MP_CONFIG)
mpi_amd64_asm.o: mpi_amd64_sun.s
$(AS) -xarch=generic64 -P -D_ASM mpi_amd64_sun.s
endif