libprio/mpi/mpi_amd64_sun.s

386 строки
8.6 KiB
ArmAsm

/ This Source Code Form is subject to the terms of the Mozilla Public
/ License, v. 2.0. If a copy of the MPL was not distributed with this
/ file, You can obtain one at http://mozilla.org/MPL/2.0/.
/ ------------------------------------------------------------------------
/
/ Implementation of s_mpv_mul_set_vec which exploits
/ the 64X64->128 bit unsigned multiply instruction.
/
/ ------------------------------------------------------------------------
/ r = a * digit, r and a are vectors of length len
/ returns the carry digit
/ r and a are 64 bit aligned.
/
/ uint64_t
/ s_mpv_mul_set_vec64(uint64_t *r, uint64_t *a, int len, uint64_t digit)
/
.text; .align 16; .globl s_mpv_mul_set_vec64; .type s_mpv_mul_set_vec64, @function; s_mpv_mul_set_vec64:
xorq %rax, %rax / if (len == 0) return (0)
testq %rdx, %rdx
jz .L17
movq %rdx, %r8 / Use r8 for len; %rdx is used by mul
xorq %r9, %r9 / cy = 0
.L15:
cmpq $8, %r8 / 8 - len
jb .L16
movq 0(%rsi), %rax / rax = a[0]
movq 8(%rsi), %r11 / prefetch a[1]
mulq %rcx / p = a[0] * digit
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 0(%rdi) / r[0] = lo(p)
movq %rdx, %r9 / cy = hi(p)
movq %r11, %rax
movq 16(%rsi), %r11 / prefetch a[2]
mulq %rcx / p = a[1] * digit
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 8(%rdi) / r[1] = lo(p)
movq %rdx, %r9 / cy = hi(p)
movq %r11, %rax
movq 24(%rsi), %r11 / prefetch a[3]
mulq %rcx / p = a[2] * digit
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 16(%rdi) / r[2] = lo(p)
movq %rdx, %r9 / cy = hi(p)
movq %r11, %rax
movq 32(%rsi), %r11 / prefetch a[4]
mulq %rcx / p = a[3] * digit
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 24(%rdi) / r[3] = lo(p)
movq %rdx, %r9 / cy = hi(p)
movq %r11, %rax
movq 40(%rsi), %r11 / prefetch a[5]
mulq %rcx / p = a[4] * digit
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 32(%rdi) / r[4] = lo(p)
movq %rdx, %r9 / cy = hi(p)
movq %r11, %rax
movq 48(%rsi), %r11 / prefetch a[6]
mulq %rcx / p = a[5] * digit
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 40(%rdi) / r[5] = lo(p)
movq %rdx, %r9 / cy = hi(p)
movq %r11, %rax
movq 56(%rsi), %r11 / prefetch a[7]
mulq %rcx / p = a[6] * digit
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 48(%rdi) / r[6] = lo(p)
movq %rdx, %r9 / cy = hi(p)
movq %r11, %rax
mulq %rcx / p = a[7] * digit
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 56(%rdi) / r[7] = lo(p)
movq %rdx, %r9 / cy = hi(p)
addq $64, %rsi
addq $64, %rdi
subq $8, %r8
jz .L17
jmp .L15
.L16:
movq 0(%rsi), %rax
mulq %rcx / p = a[0] * digit
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 0(%rdi) / r[0] = lo(p)
movq %rdx, %r9 / cy = hi(p)
decq %r8
jz .L17
movq 8(%rsi), %rax
mulq %rcx / p = a[1] * digit
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 8(%rdi) / r[1] = lo(p)
movq %rdx, %r9 / cy = hi(p)
decq %r8
jz .L17
movq 16(%rsi), %rax
mulq %rcx / p = a[2] * digit
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 16(%rdi) / r[2] = lo(p)
movq %rdx, %r9 / cy = hi(p)
decq %r8
jz .L17
movq 24(%rsi), %rax
mulq %rcx / p = a[3] * digit
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 24(%rdi) / r[3] = lo(p)
movq %rdx, %r9 / cy = hi(p)
decq %r8
jz .L17
movq 32(%rsi), %rax
mulq %rcx / p = a[4] * digit
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 32(%rdi) / r[4] = lo(p)
movq %rdx, %r9 / cy = hi(p)
decq %r8
jz .L17
movq 40(%rsi), %rax
mulq %rcx / p = a[5] * digit
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 40(%rdi) / r[5] = lo(p)
movq %rdx, %r9 / cy = hi(p)
decq %r8
jz .L17
movq 48(%rsi), %rax
mulq %rcx / p = a[6] * digit
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 48(%rdi) / r[6] = lo(p)
movq %rdx, %r9 / cy = hi(p)
decq %r8
jz .L17
.L17:
movq %r9, %rax
ret
.size s_mpv_mul_set_vec64, .-s_mpv_mul_set_vec64
/ ------------------------------------------------------------------------
/
/ Implementation of s_mpv_mul_add_vec which exploits
/ the 64X64->128 bit unsigned multiply instruction.
/
/ ------------------------------------------------------------------------
/ r += a * digit, r and a are vectors of length len
/ returns the carry digit
/ r and a are 64 bit aligned.
/
/ uint64_t
/ s_mpv_mul_add_vec64(uint64_t *r, uint64_t *a, int len, uint64_t digit)
/
.text; .align 16; .globl s_mpv_mul_add_vec64; .type s_mpv_mul_add_vec64, @function; s_mpv_mul_add_vec64:
xorq %rax, %rax / if (len == 0) return (0)
testq %rdx, %rdx
jz .L27
movq %rdx, %r8 / Use r8 for len; %rdx is used by mul
xorq %r9, %r9 / cy = 0
.L25:
cmpq $8, %r8 / 8 - len
jb .L26
movq 0(%rsi), %rax / rax = a[0]
movq 0(%rdi), %r10 / r10 = r[0]
movq 8(%rsi), %r11 / prefetch a[1]
mulq %rcx / p = a[0] * digit
addq %r10, %rax
adcq $0, %rdx / p += r[0]
movq 8(%rdi), %r10 / prefetch r[1]
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 0(%rdi) / r[0] = lo(p)
movq %rdx, %r9 / cy = hi(p)
movq %r11, %rax
movq 16(%rsi), %r11 / prefetch a[2]
mulq %rcx / p = a[1] * digit
addq %r10, %rax
adcq $0, %rdx / p += r[1]
movq 16(%rdi), %r10 / prefetch r[2]
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 8(%rdi) / r[1] = lo(p)
movq %rdx, %r9 / cy = hi(p)
movq %r11, %rax
movq 24(%rsi), %r11 / prefetch a[3]
mulq %rcx / p = a[2] * digit
addq %r10, %rax
adcq $0, %rdx / p += r[2]
movq 24(%rdi), %r10 / prefetch r[3]
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 16(%rdi) / r[2] = lo(p)
movq %rdx, %r9 / cy = hi(p)
movq %r11, %rax
movq 32(%rsi), %r11 / prefetch a[4]
mulq %rcx / p = a[3] * digit
addq %r10, %rax
adcq $0, %rdx / p += r[3]
movq 32(%rdi), %r10 / prefetch r[4]
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 24(%rdi) / r[3] = lo(p)
movq %rdx, %r9 / cy = hi(p)
movq %r11, %rax
movq 40(%rsi), %r11 / prefetch a[5]
mulq %rcx / p = a[4] * digit
addq %r10, %rax
adcq $0, %rdx / p += r[4]
movq 40(%rdi), %r10 / prefetch r[5]
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 32(%rdi) / r[4] = lo(p)
movq %rdx, %r9 / cy = hi(p)
movq %r11, %rax
movq 48(%rsi), %r11 / prefetch a[6]
mulq %rcx / p = a[5] * digit
addq %r10, %rax
adcq $0, %rdx / p += r[5]
movq 48(%rdi), %r10 / prefetch r[6]
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 40(%rdi) / r[5] = lo(p)
movq %rdx, %r9 / cy = hi(p)
movq %r11, %rax
movq 56(%rsi), %r11 / prefetch a[7]
mulq %rcx / p = a[6] * digit
addq %r10, %rax
adcq $0, %rdx / p += r[6]
movq 56(%rdi), %r10 / prefetch r[7]
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 48(%rdi) / r[6] = lo(p)
movq %rdx, %r9 / cy = hi(p)
movq %r11, %rax
mulq %rcx / p = a[7] * digit
addq %r10, %rax
adcq $0, %rdx / p += r[7]
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 56(%rdi) / r[7] = lo(p)
movq %rdx, %r9 / cy = hi(p)
addq $64, %rsi
addq $64, %rdi
subq $8, %r8
jz .L27
jmp .L25
.L26:
movq 0(%rsi), %rax
movq 0(%rdi), %r10
mulq %rcx / p = a[0] * digit
addq %r10, %rax
adcq $0, %rdx / p += r[0]
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 0(%rdi) / r[0] = lo(p)
movq %rdx, %r9 / cy = hi(p)
decq %r8
jz .L27
movq 8(%rsi), %rax
movq 8(%rdi), %r10
mulq %rcx / p = a[1] * digit
addq %r10, %rax
adcq $0, %rdx / p += r[1]
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 8(%rdi) / r[1] = lo(p)
movq %rdx, %r9 / cy = hi(p)
decq %r8
jz .L27
movq 16(%rsi), %rax
movq 16(%rdi), %r10
mulq %rcx / p = a[2] * digit
addq %r10, %rax
adcq $0, %rdx / p += r[2]
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 16(%rdi) / r[2] = lo(p)
movq %rdx, %r9 / cy = hi(p)
decq %r8
jz .L27
movq 24(%rsi), %rax
movq 24(%rdi), %r10
mulq %rcx / p = a[3] * digit
addq %r10, %rax
adcq $0, %rdx / p += r[3]
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 24(%rdi) / r[3] = lo(p)
movq %rdx, %r9 / cy = hi(p)
decq %r8
jz .L27
movq 32(%rsi), %rax
movq 32(%rdi), %r10
mulq %rcx / p = a[4] * digit
addq %r10, %rax
adcq $0, %rdx / p += r[4]
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 32(%rdi) / r[4] = lo(p)
movq %rdx, %r9 / cy = hi(p)
decq %r8
jz .L27
movq 40(%rsi), %rax
movq 40(%rdi), %r10
mulq %rcx / p = a[5] * digit
addq %r10, %rax
adcq $0, %rdx / p += r[5]
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 40(%rdi) / r[5] = lo(p)
movq %rdx, %r9 / cy = hi(p)
decq %r8
jz .L27
movq 48(%rsi), %rax
movq 48(%rdi), %r10
mulq %rcx / p = a[6] * digit
addq %r10, %rax
adcq $0, %rdx / p += r[6]
addq %r9, %rax
adcq $0, %rdx / p += cy
movq %rax, 48(%rdi) / r[6] = lo(p)
movq %rdx, %r9 / cy = hi(p)
decq %r8
jz .L27
.L27:
movq %r9, %rax
ret
.size s_mpv_mul_add_vec64, .-s_mpv_mul_add_vec64