| / This Source Code Form is subject to the terms of the Mozilla Public |
| / License, v. 2.0. If a copy of the MPL was not distributed with this |
| / file, You can obtain one at http://mozilla.org/MPL/2.0/. |
| |
| |
| / ------------------------------------------------------------------------ |
| / |
| / Implementation of s_mpv_mul_set_vec which exploits |
| / the 64X64->128 bit unsigned multiply instruction. |
| / |
| / ------------------------------------------------------------------------ |
| |
| / r = a * digit, r and a are vectors of length len |
| / returns the carry digit |
| / r and a are 64 bit aligned. |
| / |
| / uint64_t |
| / s_mpv_mul_set_vec64(uint64_t *r, uint64_t *a, int len, uint64_t digit) |
| / |
| |
| .text; .align 16; .globl s_mpv_mul_set_vec64; .type s_mpv_mul_set_vec64, @function; s_mpv_mul_set_vec64: |
| |
| xorq %rax, %rax / if (len == 0) return (0) |
| testq %rdx, %rdx |
| jz .L17 |
| |
| movq %rdx, %r8 / Use r8 for len; %rdx is used by mul |
| xorq %r9, %r9 / cy = 0 |
| |
| .L15: |
| cmpq $8, %r8 / 8 - len |
| jb .L16 |
| movq 0(%rsi), %rax / rax = a[0] |
| movq 8(%rsi), %r11 / prefetch a[1] |
| mulq %rcx / p = a[0] * digit |
| addq %r9, %rax |
| adcq $0, %rdx / p += cy |
| movq %rax, 0(%rdi) / r[0] = lo(p) |
| movq %rdx, %r9 / cy = hi(p) |
| |
| movq %r11, %rax |
| movq 16(%rsi), %r11 / prefetch a[2] |
| mulq %rcx / p = a[1] * digit |
| addq %r9, %rax |
| adcq $0, %rdx / p += cy |
| movq %rax, 8(%rdi) / r[1] = lo(p) |
| movq %rdx, %r9 / cy = hi(p) |
| |
| movq %r11, %rax |
| movq 24(%rsi), %r11 / prefetch a[3] |
| mulq %rcx / p = a[2] * digit |
| addq %r9, %rax |
| adcq $0, %rdx / p += cy |
| movq %rax, 16(%rdi) / r[2] = lo(p) |
| movq %rdx, %r9 / cy = hi(p) |
| |
| movq %r11, %rax |
| movq 32(%rsi), %r11 / prefetch a[4] |
| mulq %rcx / p = a[3] * digit |
| addq %r9, %rax |
| adcq $0, %rdx / p += cy |
| movq %rax, 24(%rdi) / r[3] = lo(p) |
| movq %rdx, %r9 / cy = hi(p) |
| |
| movq %r11, %rax |
| movq 40(%rsi), %r11 / prefetch a[5] |
| mulq %rcx / p = a[4] * digit |
| addq %r9, %rax |
| adcq $0, %rdx / p += cy |
| movq %rax, 32(%rdi) / r[4] = lo(p) |
| movq %rdx, %r9 / cy = hi(p) |
| |
| movq %r11, %rax |
| movq 48(%rsi), %r11 / prefetch a[6] |
| mulq %rcx / p = a[5] * digit |
| addq %r9, %rax |
| adcq $0, %rdx / p += cy |
| movq %rax, 40(%rdi) / r[5] = lo(p) |
| movq %rdx, %r9 / cy = hi(p) |
| |
| movq %r11, %rax |
| movq 56(%rsi), %r11 / prefetch a[7] |
| mulq %rcx / p = a[6] * digit |
| addq %r9, %rax |
| adcq $0, %rdx / p += cy |
| movq %rax, 48(%rdi) / r[6] = lo(p) |
| movq %rdx, %r9 / cy = hi(p) |
| |
| movq %r11, %rax |
| mulq %rcx / p = a[7] * digit |
| addq %r9, %rax |
| adcq $0, %rdx / p += cy |
| movq %rax, 56(%rdi) / r[7] = lo(p) |
| movq %rdx, %r9 / cy = hi(p) |
| |
| addq $64, %rsi |
| addq $64, %rdi |
| subq $8, %r8 |
| |
| jz .L17 |
| jmp .L15 |
| |
| .L16: |
| movq 0(%rsi), %rax |
| mulq %rcx / p = a[0] * digit |
| addq %r9, %rax |
| adcq $0, %rdx / p += cy |
| movq %rax, 0(%rdi) / r[0] = lo(p) |
| movq %rdx, %r9 / cy = hi(p) |
| decq %r8 |
| jz .L17 |
| |
| movq 8(%rsi), %rax |
| mulq %rcx / p = a[1] * digit |
| addq %r9, %rax |
| adcq $0, %rdx / p += cy |
| movq %rax, 8(%rdi) / r[1] = lo(p) |
| movq %rdx, %r9 / cy = hi(p) |
| decq %r8 |
| jz .L17 |
| |
| movq 16(%rsi), %rax |
| mulq %rcx / p = a[2] * digit |
| addq %r9, %rax |
| adcq $0, %rdx / p += cy |
| movq %rax, 16(%rdi) / r[2] = lo(p) |
| movq %rdx, %r9 / cy = hi(p) |
| decq %r8 |
| jz .L17 |
| |
| movq 24(%rsi), %rax |
| mulq %rcx / p = a[3] * digit |
| addq %r9, %rax |
| adcq $0, %rdx / p += cy |
| movq %rax, 24(%rdi) / r[3] = lo(p) |
| movq %rdx, %r9 / cy = hi(p) |
| decq %r8 |
| jz .L17 |
| |
| movq 32(%rsi), %rax |
| mulq %rcx / p = a[4] * digit |
| addq %r9, %rax |
| adcq $0, %rdx / p += cy |
| movq %rax, 32(%rdi) / r[4] = lo(p) |
| movq %rdx, %r9 / cy = hi(p) |
| decq %r8 |
| jz .L17 |
| |
| movq 40(%rsi), %rax |
| mulq %rcx / p = a[5] * digit |
| addq %r9, %rax |
| adcq $0, %rdx / p += cy |
| movq %rax, 40(%rdi) / r[5] = lo(p) |
| movq %rdx, %r9 / cy = hi(p) |
| decq %r8 |
| jz .L17 |
| |
| movq 48(%rsi), %rax |
| mulq %rcx / p = a[6] * digit |
| addq %r9, %rax |
| adcq $0, %rdx / p += cy |
| movq %rax, 48(%rdi) / r[6] = lo(p) |
| movq %rdx, %r9 / cy = hi(p) |
| decq %r8 |
| jz .L17 |
| |
| |
| .L17: |
| movq %r9, %rax |
| ret |
| |
| .size s_mpv_mul_set_vec64, .-s_mpv_mul_set_vec64 |
| |
| / ------------------------------------------------------------------------ |
| / |
| / Implementation of s_mpv_mul_add_vec which exploits |
| / the 64X64->128 bit unsigned multiply instruction. |
| / |
| / ------------------------------------------------------------------------ |
| |
| / r += a * digit, r and a are vectors of length len |
| / returns the carry digit |
| / r and a are 64 bit aligned. |
| / |
| / uint64_t |
| / s_mpv_mul_add_vec64(uint64_t *r, uint64_t *a, int len, uint64_t digit) |
| / |
| |
| .text; .align 16; .globl s_mpv_mul_add_vec64; .type s_mpv_mul_add_vec64, @function; s_mpv_mul_add_vec64: |
| |
| xorq %rax, %rax / if (len == 0) return (0) |
| testq %rdx, %rdx |
| jz .L27 |
| |
| movq %rdx, %r8 / Use r8 for len; %rdx is used by mul |
| xorq %r9, %r9 / cy = 0 |
| |
| .L25: |
| cmpq $8, %r8 / 8 - len |
| jb .L26 |
| movq 0(%rsi), %rax / rax = a[0] |
| movq 0(%rdi), %r10 / r10 = r[0] |
| movq 8(%rsi), %r11 / prefetch a[1] |
| mulq %rcx / p = a[0] * digit |
| addq %r10, %rax |
| adcq $0, %rdx / p += r[0] |
| movq 8(%rdi), %r10 / prefetch r[1] |
| addq %r9, %rax |
| adcq $0, %rdx / p += cy |
| movq %rax, 0(%rdi) / r[0] = lo(p) |
| movq %rdx, %r9 / cy = hi(p) |
| |
| movq %r11, %rax |
| movq 16(%rsi), %r11 / prefetch a[2] |
| mulq %rcx / p = a[1] * digit |
| addq %r10, %rax |
| adcq $0, %rdx / p += r[1] |
| movq 16(%rdi), %r10 / prefetch r[2] |
| addq %r9, %rax |
| adcq $0, %rdx / p += cy |
| movq %rax, 8(%rdi) / r[1] = lo(p) |
| movq %rdx, %r9 / cy = hi(p) |
| |
| movq %r11, %rax |
| movq 24(%rsi), %r11 / prefetch a[3] |
| mulq %rcx / p = a[2] * digit |
| addq %r10, %rax |
| adcq $0, %rdx / p += r[2] |
| movq 24(%rdi), %r10 / prefetch r[3] |
| addq %r9, %rax |
| adcq $0, %rdx / p += cy |
| movq %rax, 16(%rdi) / r[2] = lo(p) |
| movq %rdx, %r9 / cy = hi(p) |
| |
| movq %r11, %rax |
| movq 32(%rsi), %r11 / prefetch a[4] |
| mulq %rcx / p = a[3] * digit |
| addq %r10, %rax |
| adcq $0, %rdx / p += r[3] |
| movq 32(%rdi), %r10 / prefetch r[4] |
| addq %r9, %rax |
| adcq $0, %rdx / p += cy |
| movq %rax, 24(%rdi) / r[3] = lo(p) |
| movq %rdx, %r9 / cy = hi(p) |
| |
| movq %r11, %rax |
| movq 40(%rsi), %r11 / prefetch a[5] |
| mulq %rcx / p = a[4] * digit |
| addq %r10, %rax |
| adcq $0, %rdx / p += r[4] |
| movq 40(%rdi), %r10 / prefetch r[5] |
| addq %r9, %rax |
| adcq $0, %rdx / p += cy |
| movq %rax, 32(%rdi) / r[4] = lo(p) |
| movq %rdx, %r9 / cy = hi(p) |
| |
| movq %r11, %rax |
| movq 48(%rsi), %r11 / prefetch a[6] |
| mulq %rcx / p = a[5] * digit |
| addq %r10, %rax |
| adcq $0, %rdx / p += r[5] |
| movq 48(%rdi), %r10 / prefetch r[6] |
| addq %r9, %rax |
| adcq $0, %rdx / p += cy |
| movq %rax, 40(%rdi) / r[5] = lo(p) |
| movq %rdx, %r9 / cy = hi(p) |
| |
| movq %r11, %rax |
| movq 56(%rsi), %r11 / prefetch a[7] |
| mulq %rcx / p = a[6] * digit |
| addq %r10, %rax |
| adcq $0, %rdx / p += r[6] |
| movq 56(%rdi), %r10 / prefetch r[7] |
| addq %r9, %rax |
| adcq $0, %rdx / p += cy |
| movq %rax, 48(%rdi) / r[6] = lo(p) |
| movq %rdx, %r9 / cy = hi(p) |
| |
| movq %r11, %rax |
| mulq %rcx / p = a[7] * digit |
| addq %r10, %rax |
| adcq $0, %rdx / p += r[7] |
| addq %r9, %rax |
| adcq $0, %rdx / p += cy |
| movq %rax, 56(%rdi) / r[7] = lo(p) |
| movq %rdx, %r9 / cy = hi(p) |
| |
| addq $64, %rsi |
| addq $64, %rdi |
| subq $8, %r8 |
| |
| jz .L27 |
| jmp .L25 |
| |
| .L26: |
| movq 0(%rsi), %rax |
| movq 0(%rdi), %r10 |
| mulq %rcx / p = a[0] * digit |
| addq %r10, %rax |
| adcq $0, %rdx / p += r[0] |
| addq %r9, %rax |
| adcq $0, %rdx / p += cy |
| movq %rax, 0(%rdi) / r[0] = lo(p) |
| movq %rdx, %r9 / cy = hi(p) |
| decq %r8 |
| jz .L27 |
| |
| movq 8(%rsi), %rax |
| movq 8(%rdi), %r10 |
| mulq %rcx / p = a[1] * digit |
| addq %r10, %rax |
| adcq $0, %rdx / p += r[1] |
| addq %r9, %rax |
| adcq $0, %rdx / p += cy |
| movq %rax, 8(%rdi) / r[1] = lo(p) |
| movq %rdx, %r9 / cy = hi(p) |
| decq %r8 |
| jz .L27 |
| |
| movq 16(%rsi), %rax |
| movq 16(%rdi), %r10 |
| mulq %rcx / p = a[2] * digit |
| addq %r10, %rax |
| adcq $0, %rdx / p += r[2] |
| addq %r9, %rax |
| adcq $0, %rdx / p += cy |
| movq %rax, 16(%rdi) / r[2] = lo(p) |
| movq %rdx, %r9 / cy = hi(p) |
| decq %r8 |
| jz .L27 |
| |
| movq 24(%rsi), %rax |
| movq 24(%rdi), %r10 |
| mulq %rcx / p = a[3] * digit |
| addq %r10, %rax |
| adcq $0, %rdx / p += r[3] |
| addq %r9, %rax |
| adcq $0, %rdx / p += cy |
| movq %rax, 24(%rdi) / r[3] = lo(p) |
| movq %rdx, %r9 / cy = hi(p) |
| decq %r8 |
| jz .L27 |
| |
| movq 32(%rsi), %rax |
| movq 32(%rdi), %r10 |
| mulq %rcx / p = a[4] * digit |
| addq %r10, %rax |
| adcq $0, %rdx / p += r[4] |
| addq %r9, %rax |
| adcq $0, %rdx / p += cy |
| movq %rax, 32(%rdi) / r[4] = lo(p) |
| movq %rdx, %r9 / cy = hi(p) |
| decq %r8 |
| jz .L27 |
| |
| movq 40(%rsi), %rax |
| movq 40(%rdi), %r10 |
| mulq %rcx / p = a[5] * digit |
| addq %r10, %rax |
| adcq $0, %rdx / p += r[5] |
| addq %r9, %rax |
| adcq $0, %rdx / p += cy |
| movq %rax, 40(%rdi) / r[5] = lo(p) |
| movq %rdx, %r9 / cy = hi(p) |
| decq %r8 |
| jz .L27 |
| |
| movq 48(%rsi), %rax |
| movq 48(%rdi), %r10 |
| mulq %rcx / p = a[6] * digit |
| addq %r10, %rax |
| adcq $0, %rdx / p += r[6] |
| addq %r9, %rax |
| adcq $0, %rdx / p += cy |
| movq %rax, 48(%rdi) / r[6] = lo(p) |
| movq %rdx, %r9 / cy = hi(p) |
| decq %r8 |
| jz .L27 |
| |
| |
| .L27: |
| movq %r9, %rax |
| ret |
| |
| .size s_mpv_mul_add_vec64, .-s_mpv_mul_add_vec64 |