| |
| ; ============================================================================ |
| ; bandwidth 0.23, a benchmark to estimate memory transfer bandwidth. |
| ; Copyright (C) 2005-2010 by Zack T Smith. |
| ; |
| ; This program is free software; you can redistribute it and/or modify |
| ; it under the terms of the GNU General Public License as published by |
| ; the Free Software Foundation; either version 2 of the License, or |
| ; (at your option) any later version. |
| ; |
| ; This program is distributed in the hope that it will be useful, |
| ; but WITHOUT ANY WARRANTY; without even the implied warranty of |
| ; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
| ; GNU General Public License for more details. |
| ; |
| ; You should have received a copy of the GNU General Public License |
| ; along with this program; if not, write to the Free Software |
| ; Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA |
| ; |
| ; The author may be reached at fbui@comcast.net. |
| ; ============================================================================= |
| |
| bits 64 |
| cpu x64 |
| |
| ; Every routine is exported under two names: the plain one and an |
| ; underscore-prefixed alias (presumably for toolchains that prepend '_' |
| ; to C symbols -- confirm against the build). |
| |
| ; --- CPU feature detection --- |
| global has_sse2 |
| global _has_sse2 |
| |
| ; --- Memory readers --- |
| global Reader |
| global _Reader |
| global RandomReader |
| global _RandomReader |
| global ReaderSSE2 |
| global _ReaderSSE2 |
| global RandomReaderSSE2 |
| global _RandomReaderSSE2 |
| |
| ; --- Memory writers --- |
| global Writer |
| global _Writer |
| global RandomWriter |
| global _RandomWriter |
| global WriterSSE2 |
| global _WriterSSE2 |
| global RandomWriterSSE2 |
| global _RandomWriterSSE2 |
| |
| ; --- Memory writers that bypass the caches --- |
| global WriterSSE2_bypass |
| global _WriterSSE2_bypass |
| global RandomWriterSSE2_bypass |
| global _RandomWriterSSE2_bypass |
| |
| ; --- Memory copy --- |
| global CopySSE |
| global _CopySSE |
| |
| ; --- Register <-> register transfers --- |
| global RegisterToRegister |
| global _RegisterToRegister |
| global RegisterToVector |
| global _RegisterToVector |
| global VectorToRegister |
| global _VectorToRegister |
| global VectorToVector |
| global _VectorToVector |
| |
| ; --- Sub-register inserts into / extracts from vector registers --- |
| global Register8ToVector |
| global _Register8ToVector |
| global Register16ToVector |
| global _Register16ToVector |
| global Register32ToVector |
| global _Register32ToVector |
| global Register64ToVector |
| global _Register64ToVector |
| global Vector8ToRegister |
| global _Vector8ToRegister |
| global Vector16ToRegister |
| global _Vector16ToRegister |
| global Vector32ToRegister |
| global _Vector32ToRegister |
| global Vector64ToRegister |
| global _Vector64ToRegister |
| |
| ; --- Stack access --- |
| global StackReader |
| global _StackReader |
| global StackWriter |
| global _StackWriter |
| |
| ; Note: |
| ; The Unix (System V AMD64) ABI passes integer parameters in these |
| ; registers, in this order: rdi, rsi, rdx, rcx, r8, r9 |
| |
| section .text |
| |
| ;------------------------------------------------------------------------------ |
| ; Name: has_sse2 |
| ; Purpose: Reports whether the CPU advertises SSE2. |
| ; Params: none |
| ; Returns: rax = 1 if CPUID leaf 1 sets EDX bit 26 (0x4000000), else 0. |
| ; Notes: rbx, rcx and rdx are saved and restored around CPUID, which |
| ; overwrites eax, ebx, ecx and edx. |
| ;------------------------------------------------------------------------------ |
| has_sse2: |
| _has_sse2: |
| push rbx |
| push rcx |
| push rdx |
| mov eax, 1 ; CPUID leaf 1: feature flags. |
| cpuid |
| xor eax, eax ; Discard CPUID's EAX so the result is exactly 0 or 1. |
| test edx, 0x4000000 ; EDX bit 26 = SSE2. |
| setnz al |
| pop rdx |
| pop rcx |
| pop rbx |
| ret |
| |
| ;------------------------------------------------------------------------------ |
| ; Name: Reader |
| ; Purpose: Loads 64-bit words sequentially from a memory area, one 256-byte |
| ; chunk (32 loads) per inner-loop pass, and repeats the whole sweep. |
| ; Params: rdi = ptr to memory area |
| ; rsi = length in bytes |
| ; rdx = loops |
| ; Notes: rsi is converted into the end pointer. Both loops test at the |
| ; bottom, so at least one chunk is read per sweep. rax receives |
| ; every loaded value; r10 is saved and restored. |
| ;------------------------------------------------------------------------------ |
| Reader: |
| _Reader: |
| push r10 |
| |
| add rsi, rdi ; rsi now points to end. |
| |
| .sweep: |
| mov r10, rdi ; Restart the cursor at the base of the area. |
| |
| .chunk: |
| ; 32 loads x 8 bytes = 256 bytes, at offsets 0, 8, ..., 248. |
| %assign READER_DISP 0 |
| %rep 32 |
| mov rax, [r10 + READER_DISP] |
| %assign READER_DISP READER_DISP + 8 |
| %endrep |
| %undef READER_DISP |
| |
| add r10, 256 |
| cmp r10, rsi |
| jb .chunk |
| |
| dec rdx |
| jnz .sweep |
| |
| pop r10 |
| ret |
| |
| ;------------------------------------------------------------------------------ |
| ; Name: RandomReader |
| ; Purpose: Loads 64-bit words from 256-byte chunks reached through a table |
| ; of chunk pointers, visiting the 32 words of each chunk in a fixed |
| ; shuffled order. |
| ; Params: rdi = ptr to array of chunk pointers |
| ; rsi = # of chunks |
| ; rdx = loops |
| ; Notes: rax receives every loaded value; r10 and r11 are saved and |
| ; restored. Both loops test at the bottom. |
| ;------------------------------------------------------------------------------ |
| |
| ; Emits one "mov rax, [r10 + offset]" per argument, in argument order. |
| %macro RANDOMREADER_LOADS 1-* |
| %rep %0 |
| mov rax, [r10 + %1] |
| %rotate 1 |
| %endrep |
| %endmacro |
| |
| RandomReader: |
| _RandomReader: |
| push r10 |
| push r11 |
| |
| .sweep: |
| xor r11, r11 ; r11 = index into the pointer table. |
| |
| .chunk: |
| mov r10, [rdi + 8*r11] ; 64-bit pointers, hence the scale of 8. |
| |
| RANDOMREADER_LOADS 96, 0, 120, 184, 160, 176, 112, 80 |
| RANDOMREADER_LOADS 32, 128, 88, 40, 48, 72, 200, 24 |
| RANDOMREADER_LOADS 152, 16, 248, 56, 240, 208, 104, 216 |
| RANDOMREADER_LOADS 136, 232, 64, 224, 144, 192, 8, 168 |
| |
| inc r11 |
| cmp r11, rsi |
| jb .chunk |
| |
| dec rdx |
| jnz .sweep |
| |
| pop r11 |
| pop r10 |
| ret |
| |
| ;------------------------------------------------------------------------------ |
| ; Name: RandomReaderSSE2 |
| ; Purpose: Loads 128-bit values from 256-byte chunks reached through a table |
| ; of chunk pointers, visiting the 16 slots of each chunk in a fixed |
| ; shuffled order. |
| ; Params: rdi = ptr to array of chunk pointers |
| ; rsi = # of chunks |
| ; rdx = loops |
| ; Notes: Uses movdqa, so every chunk must be 16-byte aligned. xmm0 |
| ; receives every loaded value; r10 and r11 are saved and restored. |
| ;------------------------------------------------------------------------------ |
| |
| ; Emits one "movdqa xmm0, [r10 + offset]" per argument, in argument order. |
| %macro RANDOMREADERSSE2_LOADS 1-* |
| %rep %0 |
| movdqa xmm0, [r10 + %1] |
| %rotate 1 |
| %endrep |
| %endmacro |
| |
| RandomReaderSSE2: |
| _RandomReaderSSE2: |
| push r10 |
| push r11 |
| |
| .sweep: |
| xor r11, r11 ; r11 = index into the pointer table. |
| |
| .chunk: |
| mov r10, [rdi + 8*r11] |
| |
| RANDOMREADERSSE2_LOADS 240, 128, 64, 208, 112, 176, 144, 0 |
| RANDOMREADERSSE2_LOADS 96, 16, 192, 160, 32, 48, 224, 80 |
| |
| inc r11 |
| cmp r11, rsi |
| jb .chunk |
| |
| dec rdx |
| jnz .sweep |
| |
| pop r11 |
| pop r10 |
| ret |
| |
| ;------------------------------------------------------------------------------ |
| ; Name: RandomWriter |
| ; Purpose: Stores a 64-bit value into 256-byte chunks reached through a |
| ; table of chunk pointers, filling the 32 words of each chunk in a |
| ; fixed shuffled order. |
| ; Params: rdi = ptr to array of chunk pointers |
| ; rsi = # of chunks |
| ; rdx = loops |
| ; rcx = datum to write |
| ; Notes: r10 and r11 are saved and restored. Both loops test at the |
| ; bottom. |
| ;------------------------------------------------------------------------------ |
| |
| ; Emits one "mov [r10 + offset], rcx" per argument, in argument order. |
| %macro RANDOMWRITER_STORES 1-* |
| %rep %0 |
| mov [r10 + %1], rcx |
| %rotate 1 |
| %endrep |
| %endmacro |
| |
| RandomWriter: |
| _RandomWriter: |
| push r10 |
| push r11 |
| |
| .sweep: |
| xor r11, r11 ; r11 = index into the pointer table. |
| |
| .chunk: |
| mov r10, [rdi + 8*r11] ; 64-bit pointers, hence the scale of 8. |
| |
| RANDOMWRITER_STORES 96, 0, 120, 184, 160, 176, 112, 80 |
| RANDOMWRITER_STORES 32, 128, 88, 40, 48, 72, 200, 24 |
| RANDOMWRITER_STORES 152, 16, 248, 56, 240, 208, 104, 216 |
| RANDOMWRITER_STORES 136, 232, 64, 224, 144, 192, 8, 168 |
| |
| inc r11 |
| cmp r11, rsi |
| jb .chunk |
| |
| dec rdx |
| jnz .sweep |
| |
| pop r11 |
| pop r10 |
| ret |
| |
| ;------------------------------------------------------------------------------ |
| ; Name: RandomWriterSSE2 |
| ; Purpose: Stores a 128-bit value into 256-byte chunks reached through a |
| ; table of chunk pointers, filling the 16 slots of each chunk in a |
| ; fixed shuffled order. |
| ; Params: rdi = ptr to array of chunk pointers |
| ; rsi = # of chunks |
| ; rdx = loops |
| ; rcx = datum to write (replicated into both 64-bit halves) |
| ; Notes: Uses movdqa, so every chunk must be 16-byte aligned. Overwrites |
| ; xmm0; r10 and r11 are saved and restored. |
| ;------------------------------------------------------------------------------ |
| RandomWriterSSE2: |
| _RandomWriterSSE2: |
| push r10 |
| push r11 |
| |
| ; Create duplicated 128-bit datum. The previous "pslldq xmm1, 64" shifted |
| ; by 64 *bytes*, which clears the register, so the upper half was zero. |
| movq xmm0, rcx |
| punpcklqdq xmm0, xmm0 ; Copy the low quadword into the high quadword. |
| |
| .sweep: |
| xor r11, r11 ; r11 = index into the pointer table. |
| |
| .chunk: |
| mov r10, [rdi + 8*r11] ; Note, 64-bit pointers. |
| |
| movdqa [240+r10], xmm0 |
| movdqa [128+r10], xmm0 |
| movdqa [208+r10], xmm0 |
| movdqa [112+r10], xmm0 |
| movdqa [64+r10], xmm0 |
| movdqa [176+r10], xmm0 |
| movdqa [144+r10], xmm0 |
| movdqa [r10], xmm0 |
| movdqa [96+r10], xmm0 |
| movdqa [16+r10], xmm0 |
| movdqa [192+r10], xmm0 |
| movdqa [160+r10], xmm0 |
| movdqa [32+r10], xmm0 |
| movdqa [48+r10], xmm0 |
| movdqa [224+r10], xmm0 |
| movdqa [80+r10], xmm0 |
| |
| inc r11 |
| cmp r11, rsi |
| jb .chunk |
| |
| dec rdx |
| jnz .sweep |
| |
| pop r11 |
| pop r10 |
| ret |
| |
| ;------------------------------------------------------------------------------ |
| ; Name: RandomWriterSSE2_bypass |
| ; Purpose: Stores a 128-bit value into 256-byte chunks reached through a |
| ; table of chunk pointers using non-temporal stores (movntdq), |
| ; bypassing caches. The 16 slots of each chunk are filled in a |
| ; fixed shuffled order. |
| ; Params: rdi = ptr to array of chunk pointers |
| ; rsi = # of chunks |
| ; rdx = loops |
| ; rcx = datum to write (replicated into both 64-bit halves) |
| ; Notes: movntdq requires 16-byte aligned destinations. Overwrites xmm0; |
| ; r10 and r11 are saved and restored. |
| ;------------------------------------------------------------------------------ |
| RandomWriterSSE2_bypass: |
| _RandomWriterSSE2_bypass: |
| push r10 |
| push r11 |
| |
| ; Create duplicated 128-bit datum. The previous "pslldq xmm1, 64" shifted |
| ; by 64 *bytes*, which clears the register, so the upper half was zero. |
| movq xmm0, rcx |
| punpcklqdq xmm0, xmm0 ; Copy the low quadword into the high quadword. |
| |
| .sweep: |
| xor r11, r11 ; r11 = index into the pointer table. |
| |
| .chunk: |
| mov r10, [rdi + 8*r11] ; Note, 64-bit pointers. |
| |
| movntdq [240+r10], xmm0 |
| movntdq [128+r10], xmm0 |
| movntdq [208+r10], xmm0 |
| movntdq [112+r10], xmm0 |
| movntdq [64+r10], xmm0 |
| movntdq [176+r10], xmm0 |
| movntdq [144+r10], xmm0 |
| movntdq [r10], xmm0 |
| movntdq [96+r10], xmm0 |
| movntdq [16+r10], xmm0 |
| movntdq [192+r10], xmm0 |
| movntdq [160+r10], xmm0 |
| movntdq [32+r10], xmm0 |
| movntdq [48+r10], xmm0 |
| movntdq [224+r10], xmm0 |
| movntdq [80+r10], xmm0 |
| |
| inc r11 |
| cmp r11, rsi |
| jb .chunk |
| |
| dec rdx |
| jnz .sweep |
| |
| pop r11 |
| pop r10 |
| ret |
| |
| ;------------------------------------------------------------------------------ |
| ; Name: ReaderSSE2 |
| ; Purpose: Loads 128-bit values sequentially from a memory area, one |
| ; 256-byte chunk (16 loads) per inner-loop pass, and repeats the |
| ; whole sweep. |
| ; Params: rdi = ptr to memory area |
| ; rsi = length in bytes |
| ; rdx = loops |
| ; Notes: Uses movdqa, so the area must be 16-byte aligned. rsi is |
| ; converted into the end pointer. xmm0 receives every loaded |
| ; value; r10 is saved and restored. |
| ;------------------------------------------------------------------------------ |
| ReaderSSE2: |
| _ReaderSSE2: |
| push r10 |
| |
| add rsi, rdi ; rsi now points to end. |
| |
| .sweep: |
| mov r10, rdi ; Restart the cursor at the base of the area. |
| |
| .chunk: |
| ; 16 aligned loads x 16 bytes = 256 bytes, at offsets 0, 16, ..., 240. |
| %assign READERSSE2_DISP 0 |
| %rep 16 |
| movdqa xmm0, [r10 + READERSSE2_DISP] |
| %assign READERSSE2_DISP READERSSE2_DISP + 16 |
| %endrep |
| %undef READERSSE2_DISP |
| |
| add r10, 256 |
| cmp r10, rsi |
| jb .chunk |
| |
| dec rdx |
| jnz .sweep |
| |
| pop r10 |
| ret |
| |
| ;------------------------------------------------------------------------------ |
| ; Name: Writer |
| ; Purpose: Stores a 64-bit value sequentially across a memory area, one |
| ; 256-byte chunk (32 stores) per inner-loop pass, and repeats the |
| ; whole sweep. |
| ; Params: rdi = ptr to memory area |
| ; rsi = length in bytes |
| ; rdx = loops |
| ; rcx = quad to write |
| ; Notes: rsi is converted into the end pointer. Both loops test at the |
| ; bottom, so at least one chunk is written per sweep. r10 is saved |
| ; and restored. |
| ;------------------------------------------------------------------------------ |
| Writer: |
| _Writer: |
| push r10 |
| |
| add rsi, rdi ; rsi now points to end. |
| |
| .sweep: |
| mov r10, rdi ; Restart the cursor at the base of the area. |
| |
| .chunk: |
| ; 32 stores x 8 bytes = 256 bytes, at offsets 0, 8, ..., 248. |
| %assign WRITER_DISP 0 |
| %rep 32 |
| mov [r10 + WRITER_DISP], rcx |
| %assign WRITER_DISP WRITER_DISP + 8 |
| %endrep |
| %undef WRITER_DISP |
| |
| add r10, 256 |
| cmp r10, rsi |
| jb .chunk |
| |
| dec rdx |
| jnz .sweep |
| |
| pop r10 |
| ret |
| |
| ;------------------------------------------------------------------------------ |
| ; Name: WriterSSE2 |
| ; Purpose: Stores a 128-bit value sequentially across a memory area, one |
| ; 256-byte chunk (16 stores) per inner-loop pass, and repeats the |
| ; whole sweep. |
| ; Params: rdi = ptr to memory area |
| ; rsi = length in bytes |
| ; rdx = loops |
| ; rcx = quad to write |
| ; Notes: The stored value is rcx in the low 64 bits and zero in the high |
| ; 64 bits (movq clears the upper half of xmm0). Uses movdqa, so |
| ; the area must be 16-byte aligned. Overwrites xmm0; r10 is saved |
| ; and restored. |
| ;------------------------------------------------------------------------------ |
| WriterSSE2: |
| _WriterSSE2: |
| push r10 |
| |
| add rsi, rdi ; rsi now points to end. |
| |
| movq xmm0, rcx |
| |
| .sweep: |
| mov r10, rdi ; Restart the cursor at the base of the area. |
| |
| .chunk: |
| ; 16 aligned stores x 16 bytes = 256 bytes, at offsets 0, 16, ..., 240. |
| %assign WRITERSSE2_DISP 0 |
| %rep 16 |
| movdqa [r10 + WRITERSSE2_DISP], xmm0 |
| %assign WRITERSSE2_DISP WRITERSSE2_DISP + 16 |
| %endrep |
| %undef WRITERSSE2_DISP |
| |
| add r10, 256 |
| cmp r10, rsi |
| jb .chunk |
| |
| dec rdx |
| jnz .sweep |
| |
| pop r10 |
| ret |
| |
| ;------------------------------------------------------------------------------ |
| ; Name: WriterSSE2_bypass |
| ; Purpose: Stores a 128-bit value sequentially across a memory area using |
| ; non-temporal stores (movntdq), bypassing caches. One 256-byte |
| ; chunk (16 stores) is written per inner-loop pass. |
| ; Params: rdi = ptr to memory area |
| ; rsi = length in bytes |
| ; rdx = loops |
| ; rcx = quad to write |
| ; Notes: The stored value is rcx in the low 64 bits and zero in the high |
| ; 64 bits (movq clears the upper half of xmm0). movntdq requires |
| ; 16-byte aligned destinations. Overwrites xmm0; r10 is saved and |
| ; restored. |
| ;------------------------------------------------------------------------------ |
| WriterSSE2_bypass: |
| _WriterSSE2_bypass: |
| push r10 |
| |
| add rsi, rdi ; rsi now points to end. |
| |
| movq xmm0, rcx |
| |
| .sweep: |
| mov r10, rdi ; Restart the cursor at the base of the area. |
| |
| .chunk: |
| ; 16 non-temporal stores x 16 bytes = 256 bytes, at offsets 0, 16, ..., 240. |
| %assign WRITERSSE2_BYPASS_DISP 0 |
| %rep 16 |
| movntdq [r10 + WRITERSSE2_BYPASS_DISP], xmm0 |
| %assign WRITERSSE2_BYPASS_DISP WRITERSSE2_BYPASS_DISP + 16 |
| %endrep |
| %undef WRITERSSE2_BYPASS_DISP |
| |
| add r10, 256 |
| cmp r10, rsi |
| jb .chunk |
| |
| dec rdx |
| jnz .sweep |
| |
| pop r10 |
| ret |
| |
| ;------------------------------------------------------------------------------ |
| ; Name: StackReader |
| ; Purpose: Reads 64-bit values off the stack into registers of |
| ; the main register set, effectively testing L1 cache access |
| ; *and* effective-address calculation speed. |
| ; Params: rdi = loops |
| ; Notes: Seven quadwords (56 bytes) are pushed and occupy [rsp] through |
| ; [rsp+48]; every load stays inside that range. The earlier |
| ; offsets +80 and +88 pointed past it, into the caller's frame. |
| ; Each pass performs 32 loads (256 bytes). rax receives every |
| ; loaded value. |
| ;------------------------------------------------------------------------------ |
| StackReader: |
| _StackReader: |
| push qword 7000 ; [rsp+48] |
| push qword 6000 ; [rsp+40] |
| push qword 5000 ; [rsp+32] |
| push qword 4000 ; [rsp+24] |
| push qword 3000 ; [rsp+16] |
| push qword 2000 ; [rsp+8] |
| push qword 1000 ; [rsp] |
| |
| .pass: |
| mov rax, [rsp] |
| mov rax, [rsp+16] |
| mov rax, [rsp+24] |
| mov rax, [rsp+32] |
| mov rax, [rsp+40] |
| mov rax, [rsp+8] |
| mov rax, [rsp+48] |
| mov rax, [rsp] |
| |
| mov rax, [rsp] |
| mov rax, [rsp+16] |
| mov rax, [rsp+24] |
| mov rax, [rsp+32] |
| mov rax, [rsp+40] |
| mov rax, [rsp+8] |
| mov rax, [rsp+48] |
| mov rax, [rsp] |
| |
| mov rax, [rsp] |
| mov rax, [rsp+16] |
| mov rax, [rsp+24] |
| mov rax, [rsp+32] |
| mov rax, [rsp+40] |
| mov rax, [rsp+8] |
| mov rax, [rsp+48] |
| mov rax, [rsp+8] |
| |
| mov rax, [rsp+8] |
| mov rax, [rsp+16] |
| mov rax, [rsp+24] |
| mov rax, [rsp+32] |
| mov rax, [rsp+40] |
| mov rax, [rsp+8] |
| mov rax, [rsp+48] |
| mov rax, [rsp+8] |
| |
| sub rdi, 1 |
| jnz .pass |
| |
| add rsp, 56 ; Drop the seven pushed quadwords. |
| ret |
| |
| ;------------------------------------------------------------------------------ |
| ; Name: StackWriter |
| ; Purpose: Writes 64-bit values into the stack from registers of |
| ; the main register set, effectively testing L1 cache access |
| ; *and* effective-address calculation speed. |
| ; Params: rdi = loops |
| ; Notes: Seven quadwords (56 bytes) are pushed and occupy [rsp] through |
| ; [rsp+48]; every store stays inside that range. The earlier |
| ; offsets +80 and +88 lay beyond the return address at [rsp+56], |
| ; so they overwrote the caller's stack. Each pass performs 32 |
| ; stores (256 bytes). rax is zeroed and is the value stored. |
| ;------------------------------------------------------------------------------ |
| StackWriter: |
| _StackWriter: |
| push qword 7000 ; [rsp+48] |
| push qword 6000 ; [rsp+40] |
| push qword 5000 ; [rsp+32] |
| push qword 4000 ; [rsp+24] |
| push qword 3000 ; [rsp+16] |
| push qword 2000 ; [rsp+8] |
| push qword 1000 ; [rsp] |
| |
| xor rax, rax |
| |
| .pass: |
| mov [rsp], rax |
| mov [rsp+16], rax |
| mov [rsp+24], rax |
| mov [rsp+32], rax |
| mov [rsp+40], rax |
| mov [rsp+8], rax |
| mov [rsp+48], rax |
| mov [rsp], rax |
| |
| mov [rsp], rax |
| mov [rsp+16], rax |
| mov [rsp+24], rax |
| mov [rsp+32], rax |
| mov [rsp+40], rax |
| mov [rsp+8], rax |
| mov [rsp+48], rax |
| mov [rsp], rax |
| |
| mov [rsp], rax |
| mov [rsp+16], rax |
| mov [rsp+24], rax |
| mov [rsp+32], rax |
| mov [rsp+40], rax |
| mov [rsp+8], rax |
| mov [rsp+48], rax |
| mov [rsp+8], rax |
| |
| mov [rsp+8], rax |
| mov [rsp+16], rax |
| mov [rsp+24], rax |
| mov [rsp+32], rax |
| mov [rsp+40], rax |
| mov [rsp+8], rax |
| mov [rsp+48], rax |
| mov [rsp+8], rax |
| |
| sub rdi, 1 |
| jnz .pass |
| |
| add rsp, 56 ; Drop the seven pushed quadwords. |
| ret |
| |
| ;------------------------------------------------------------------------------ |
| ; Name: RegisterToRegister |
| ; Purpose: Copies 64-bit values between registers of the main register |
| ; set: 32 moves (256 bytes) per pass, all targeting rax. |
| ; Params: rdi = loops |
| ; Notes: Only rax is overwritten; the other registers are sources. |
| ;------------------------------------------------------------------------------ |
| RegisterToRegister: |
| _RegisterToRegister: |
| .pass: |
| ; The same eight moves, emitted four times. |
| %rep 4 |
| mov rax, rbx |
| mov rax, rcx |
| mov rax, rdx |
| mov rax, rsi |
| mov rax, rdi |
| mov rax, rbp |
| mov rax, rsp |
| mov rax, rbx |
| %endrep |
| |
| sub rdi, 1 |
| jnz .pass |
| ret |
| |
| ;------------------------------------------------------------------------------ |
| ; Name: VectorToVector |
| ; Purpose: Copies 128-bit values between XMM registers: 16 moves of |
| ; 16 bytes each, i.e. 256 bytes per pass. |
| ; Params: rdi = loops |
| ; Notes: Overwrites xmm0 through xmm3. |
| ;------------------------------------------------------------------------------ |
| VectorToVector: |
| _VectorToVector: |
| .pass: |
| ; First eight transfers. |
| movdqa xmm0, xmm1 |
| movdqa xmm0, xmm2 |
| movdqa xmm0, xmm3 |
| movdqa xmm2, xmm0 |
| movdqa xmm1, xmm2 |
| movdqa xmm2, xmm1 |
| movdqa xmm0, xmm3 |
| movdqa xmm3, xmm1 |
| |
| ; Second eight transfers. |
| movdqa xmm3, xmm2 |
| movdqa xmm1, xmm3 |
| movdqa xmm2, xmm1 |
| movdqa xmm0, xmm1 |
| movdqa xmm1, xmm2 |
| movdqa xmm0, xmm1 |
| movdqa xmm0, xmm3 |
| movdqa xmm3, xmm0 |
| |
| sub rdi, 1 |
| jnz .pass |
| ret |
| |
| ;------------------------------------------------------------------------------ |
| ; Name: RegisterToVector |
| ; Purpose: Moves 64-bit main registers into 128-bit vector registers with |
| ; movq, which clears the upper unused bits. 32 transfers of |
| ; 8 bytes each make 256 bytes per pass. |
| ; Params: rdi = loops |
| ; Notes: Overwrites xmm0 through xmm3. |
| ;------------------------------------------------------------------------------ |
| RegisterToVector: |
| _RegisterToVector: |
| .pass: |
| ; Opening group of eight. |
| movq xmm1, rax |
| movq xmm2, rsi |
| movq xmm3, rbx |
| movq xmm1, rcx |
| movq xmm2, rsi |
| movq xmm3, rsp |
| movq xmm0, rdi |
| movq xmm0, rdx |
| |
| ; The remaining 24 transfers are one group of eight, emitted three times. |
| %rep 3 |
| movq xmm0, rax |
| movq xmm1, rsi |
| movq xmm2, rbx |
| movq xmm3, rcx |
| movq xmm0, rsi |
| movq xmm3, rsp |
| movq xmm2, rdi |
| movq xmm1, rdx |
| %endrep |
| |
| dec rdi |
| jnz .pass |
| ret |
| |
| ;------------------------------------------------------------------------------ |
| ; Name: VectorToRegister |
| ; Purpose: Moves the lower 64 bits of vector registers into a 64-bit main |
| ; register with movq. 32 transfers per pass, all targeting rax. |
| ; Params: rdi = loops |
| ; Notes: Only rax is overwritten; xmm0 through xmm3 are sources. |
| ;------------------------------------------------------------------------------ |
| VectorToRegister: |
| _VectorToRegister: |
| .pass: |
| ; Opening group of eight. |
| movq rax, xmm1 |
| movq rax, xmm2 |
| movq rax, xmm3 |
| movq rax, xmm1 |
| movq rax, xmm2 |
| movq rax, xmm3 |
| movq rax, xmm0 |
| movq rax, xmm0 |
| |
| ; The remaining 24 transfers are one group of eight, emitted three times. |
| %rep 3 |
| movq rax, xmm0 |
| movq rax, xmm1 |
| movq rax, xmm2 |
| movq rax, xmm3 |
| movq rax, xmm0 |
| movq rax, xmm3 |
| movq rax, xmm2 |
| movq rax, xmm1 |
| %endrep |
| |
| dec rdi |
| jnz .pass |
| ret |
| |
| ;------------------------------------------------------------------------------ |
| ; Name: Register8ToVector |
| ; Purpose: Inserts 8-bit main register values into 128-bit vector registers |
| ; with pinsrb, which leaves the other bytes of the destination |
| ; untouched. |
| ; Params: rdi = loops |
| ; Notes: One pass is 64 single-byte inserts; the loop count is multiplied |
| ; by 4 so each requested loop moves 256 bytes. Overwrites bytes of |
| ; xmm0 through xmm3. dil, the low byte of the loop counter, is |
| ; also used as a source. |
| ;------------------------------------------------------------------------------ |
| Register8ToVector: |
| _Register8ToVector: |
| shl rdi, 2 ; Four passes per requested loop. |
| .pass: |
| ; Group 1 |
| pinsrb xmm1, al, 0 |
| pinsrb xmm2, bl, 1 |
| pinsrb xmm3, cl, 2 |
| pinsrb xmm1, dl, 3 |
| pinsrb xmm2, sil, 4 |
| pinsrb xmm3, dil, 5 |
| pinsrb xmm0, bpl, 6 |
| pinsrb xmm0, spl, 7 |
| |
| ; Group 2 |
| pinsrb xmm0, al, 0 |
| pinsrb xmm1, bl, 1 |
| pinsrb xmm2, cl, 2 |
| pinsrb xmm3, dl, 3 |
| pinsrb xmm3, al, 4 |
| pinsrb xmm2, bl, 5 |
| pinsrb xmm1, bpl, 6 |
| pinsrb xmm0, spl, 7 |
| |
| ; Group 3 |
| pinsrb xmm1, r8b, 0 |
| pinsrb xmm2, r9b, 1 |
| pinsrb xmm3, r10b, 2 |
| pinsrb xmm1, r11b, 3 |
| pinsrb xmm2, r12b, 4 |
| pinsrb xmm3, al, 5 |
| pinsrb xmm0, cl, 6 |
| pinsrb xmm0, bl, 7 |
| |
| ; Group 4 |
| pinsrb xmm0, r8b, 0 |
| pinsrb xmm0, r9b, 1 |
| pinsrb xmm0, r10b, 2 |
| pinsrb xmm0, r11b, 3 |
| pinsrb xmm0, r12b, 4 |
| pinsrb xmm0, al, 5 |
| pinsrb xmm0, cl, 6 |
| pinsrb xmm0, bl, 7 |
| |
| ; Group 5 |
| pinsrb xmm1, al, 0 |
| pinsrb xmm2, bl, 1 |
| pinsrb xmm3, cl, 2 |
| pinsrb xmm1, dl, 3 |
| pinsrb xmm2, sil, 4 |
| pinsrb xmm3, dil, 5 |
| pinsrb xmm0, bpl, 6 |
| pinsrb xmm0, spl, 7 |
| |
| ; Group 6 |
| pinsrb xmm0, al, 10 |
| pinsrb xmm1, bl, 11 |
| pinsrb xmm2, cl, 12 |
| pinsrb xmm3, dl, 13 |
| pinsrb xmm3, dil, 14 |
| pinsrb xmm2, cl, 15 |
| pinsrb xmm1, al, 6 |
| pinsrb xmm0, bpl, 7 |
| |
| ; Group 7 |
| pinsrb xmm1, r8b, 10 |
| pinsrb xmm2, r9b, 11 |
| pinsrb xmm3, r10b, 12 |
| pinsrb xmm1, r11b, 13 |
| pinsrb xmm2, r12b, 14 |
| pinsrb xmm3, al, 15 |
| pinsrb xmm0, cl, 6 |
| pinsrb xmm0, bl, 7 |
| |
| ; Group 8 |
| pinsrb xmm0, r8b, 9 |
| pinsrb xmm0, r9b, 8 |
| pinsrb xmm0, r10b, 11 |
| pinsrb xmm0, r11b, 3 |
| pinsrb xmm0, r12b, 4 |
| pinsrb xmm0, al, 5 |
| pinsrb xmm0, cl, 6 |
| pinsrb xmm0, bl, 7 |
| |
| dec rdi |
| jnz .pass |
| ret |
| |
| ;------------------------------------------------------------------------------ |
| ; Name: Register16ToVector |
| ; Purpose: Inserts 16-bit main register values into 128-bit vector |
| ; registers with pinsrw, which leaves the other words of the |
| ; destination untouched. |
| ; Params: rdi = loops |
| ; Notes: One pass is 64 two-byte inserts (128 bytes); the loop count is |
| ; doubled so each requested loop moves 256 bytes. Overwrites words |
| ; of xmm0 through xmm3. di, the low word of the loop counter, is |
| ; also used as a source. |
| ;------------------------------------------------------------------------------ |
| Register16ToVector: |
| _Register16ToVector: |
| shl rdi, 1 ; Two passes per requested loop. |
| .pass: |
| ; The 64 inserts are the same 32-instruction sequence emitted twice. |
| %rep 2 |
| pinsrw xmm1, ax, 0 |
| pinsrw xmm2, bx, 1 |
| pinsrw xmm3, cx, 2 |
| pinsrw xmm1, dx, 3 |
| pinsrw xmm2, si, 4 |
| pinsrw xmm3, di, 5 |
| pinsrw xmm0, bp, 6 |
| pinsrw xmm0, sp, 7 |
| |
| pinsrw xmm0, ax, 0 |
| pinsrw xmm1, bx, 1 |
| pinsrw xmm2, cx, 2 |
| pinsrw xmm3, dx, 3 |
| pinsrw xmm3, si, 4 |
| pinsrw xmm2, di, 5 |
| pinsrw xmm1, bp, 6 |
| pinsrw xmm0, sp, 7 |
| |
| pinsrw xmm1, r8w, 0 |
| pinsrw xmm2, r9w, 1 |
| pinsrw xmm3, r10w, 2 |
| pinsrw xmm1, r11w, 3 |
| pinsrw xmm2, r12w, 4 |
| pinsrw xmm3, ax, 5 |
| pinsrw xmm0, bp, 6 |
| pinsrw xmm0, bx, 7 |
| |
| pinsrw xmm0, r8w, 0 |
| pinsrw xmm0, r9w, 1 |
| pinsrw xmm0, r10w, 2 |
| pinsrw xmm0, r11w, 3 |
| pinsrw xmm0, r12w, 4 |
| pinsrw xmm0, ax, 5 |
| pinsrw xmm0, bp, 6 |
| pinsrw xmm0, bx, 7 |
| %endrep |
| |
| dec rdi |
| jnz .pass |
| ret |
| |
| ;------------------------------------------------------------------------------ |
| ; Name: Register32ToVector |
| ; Purpose: Inserts 32-bit main register values into 128-bit vector |
| ; registers with pinsrd, which leaves the other dwords of the |
| ; destination untouched. |
| ; Params: rdi = loops |
| ; Notes: Each insert moves 4 bytes, so the 64 inserts of one pass move |
| ; 256 bytes. Overwrites dwords of xmm0 through xmm3. edi, the low |
| ; dword of the loop counter, is also used as a source. |
| ;------------------------------------------------------------------------------ |
| Register32ToVector: |
| _Register32ToVector: |
| .pass: |
| ; The 64 inserts are the same 32-instruction sequence emitted twice. |
| %rep 2 |
| pinsrd xmm1, eax, 0 |
| pinsrd xmm2, ebx, 1 |
| pinsrd xmm3, ecx, 2 |
| pinsrd xmm1, edx, 3 |
| pinsrd xmm2, esi, 0 |
| pinsrd xmm3, edi, 1 |
| pinsrd xmm0, ebp, 2 |
| pinsrd xmm0, esp, 3 |
| |
| pinsrd xmm0, eax, 0 |
| pinsrd xmm1, ebx, 1 |
| pinsrd xmm2, ecx, 2 |
| pinsrd xmm3, edx, 3 |
| pinsrd xmm3, esi, 3 |
| pinsrd xmm2, edi, 2 |
| pinsrd xmm1, ebp, 1 |
| pinsrd xmm0, esp, 0 |
| |
| pinsrd xmm1, r8d, 0 |
| pinsrd xmm2, r9d, 1 |
| pinsrd xmm3, r10d, 2 |
| pinsrd xmm1, r11d, 3 |
| pinsrd xmm2, r12d, 0 |
| pinsrd xmm3, eax, 1 |
| pinsrd xmm0, ebp, 2 |
| pinsrd xmm0, ebx, 3 |
| |
| pinsrd xmm0, r8d, 0 |
| pinsrd xmm0, r9d, 1 |
| pinsrd xmm0, r10d, 2 |
| pinsrd xmm0, r11d, 3 |
| pinsrd xmm0, r12d, 0 |
| pinsrd xmm0, eax, 0 |
| pinsrd xmm0, ebp, 0 |
| pinsrd xmm0, ebx, 0 |
| %endrep |
| |
| dec rdi |
| jnz .pass |
| ret |
| |
| ;------------------------------------------------------------------------------ |
| ; Name: Register64ToVector |
| ; Purpose: Inserts 64-bit main register values into 128-bit vector |
| ; registers with pinsrq, which leaves the other quadword of the |
| ; destination untouched. |
| ; Params: rdi = loops |
| ; Notes: One pass is 16 eight-byte inserts (128 bytes); the loop count is |
| ; doubled so each requested loop performs the 32 transfers that |
| ; make up 256 bytes. Overwrites xmm0 through xmm3. |
| ;------------------------------------------------------------------------------ |
| Register64ToVector: |
| _Register64ToVector: |
| add rdi, rdi ; Two passes per requested loop. |
| .pass: |
| ; Group 1: inserts spread over xmm0..xmm3. |
| pinsrq xmm1, r8, 0 |
| pinsrq xmm2, r9, 1 |
| pinsrq xmm3, r10, 0 |
| pinsrq xmm1, r11, 1 |
| pinsrq xmm2, r12, 0 |
| pinsrq xmm3, rax, 1 |
| pinsrq xmm0, rbp, 0 |
| pinsrq xmm0, rbx, 1 |
| |
| ; Group 2: every insert targets xmm0. |
| pinsrq xmm0, r8, 0 |
| pinsrq xmm0, r9, 1 |
| pinsrq xmm0, r10, 1 |
| pinsrq xmm0, r11, 1 |
| pinsrq xmm0, r12, 0 |
| pinsrq xmm0, rax, 0 |
| pinsrq xmm0, rbp, 0 |
| pinsrq xmm0, rbx, 0 |
| |
| dec rdi |
| jnz .pass |
| ret |
| |
| |
| ;------------------------------------------------------------------------------ |
| ; Name: Vector8ToRegister |
| ; Purpose: Extracts 8-bit vector register values into a main register |
| ; with pextrb. |
| ; Params: rdi = loops |
| ; Notes: One pass is 32 single-byte extracts; the loop count is |
| ; multiplied by 8 so each requested loop moves 256 bytes. Only |
| ; rax is overwritten; xmm0 through xmm3 are sources. |
| ;------------------------------------------------------------------------------ |
| Vector8ToRegister: |
| _Vector8ToRegister: |
| shl rdi, 3 ; Eight passes per requested loop. |
| .pass: |
| ; Group 1 |
| pextrb rax, xmm1, 0 |
| pextrb rax, xmm2, 1 |
| pextrb rax, xmm3, 2 |
| pextrb rax, xmm1, 3 |
| pextrb rax, xmm2, 4 |
| pextrb rax, xmm3, 5 |
| pextrb rax, xmm0, 6 |
| pextrb rax, xmm0, 7 |
| |
| ; Group 2 |
| pextrb rax, xmm0, 0 |
| pextrb rax, xmm1, 1 |
| pextrb rax, xmm2, 2 |
| pextrb rax, xmm3, 3 |
| pextrb rax, xmm3, 4 |
| pextrb rax, xmm2, 5 |
| pextrb rax, xmm1, 6 |
| pextrb rax, xmm0, 7 |
| |
| ; Group 3 (same sequence as group 1) |
| pextrb rax, xmm1, 0 |
| pextrb rax, xmm2, 1 |
| pextrb rax, xmm3, 2 |
| pextrb rax, xmm1, 3 |
| pextrb rax, xmm2, 4 |
| pextrb rax, xmm3, 5 |
| pextrb rax, xmm0, 6 |
| pextrb rax, xmm0, 7 |
| |
| ; Group 4: every extract reads xmm0. |
| pextrb rax, xmm0, 0 |
| pextrb rax, xmm0, 1 |
| pextrb rax, xmm0, 2 |
| pextrb rax, xmm0, 3 |
| pextrb rax, xmm0, 4 |
| pextrb rax, xmm0, 5 |
| pextrb rax, xmm0, 6 |
| pextrb rax, xmm0, 7 |
| |
| dec rdi |
| jnz .pass |
| ret |
| |
| ;------------------------------------------------------------------------------ |
| ; Name: Vector16ToRegister |
| ; Purpose: Extracts 16-bit vector register values into a main register |
| ; with pextrw. |
| ; Params: rdi = loops |
| ; Notes: A 256-byte chunk at 2 bytes per transfer needs 128 transfers: |
| ; one pass is 32 extracts and the loop count is multiplied by 4. |
| ; Only rax is overwritten; xmm0 through xmm3 are sources. |
| ;------------------------------------------------------------------------------ |
| Vector16ToRegister: |
| _Vector16ToRegister: |
| shl rdi, 2 ; Four passes per requested loop. |
| .pass: |
| ; Group 1 |
| pextrw rax, xmm1, 0 |
| pextrw rax, xmm2, 1 |
| pextrw rax, xmm3, 2 |
| pextrw rax, xmm1, 3 |
| pextrw rax, xmm2, 4 |
| pextrw rax, xmm3, 5 |
| pextrw rax, xmm0, 6 |
| pextrw rax, xmm0, 7 |
| |
| ; Group 2 |
| pextrw rax, xmm0, 0 |
| pextrw rax, xmm1, 1 |
| pextrw rax, xmm2, 2 |
| pextrw rax, xmm3, 3 |
| pextrw rax, xmm3, 4 |
| pextrw rax, xmm2, 5 |
| pextrw rax, xmm1, 6 |
| pextrw rax, xmm0, 7 |
| |
| ; Group 3 (same sequence as group 1) |
| pextrw rax, xmm1, 0 |
| pextrw rax, xmm2, 1 |
| pextrw rax, xmm3, 2 |
| pextrw rax, xmm1, 3 |
| pextrw rax, xmm2, 4 |
| pextrw rax, xmm3, 5 |
| pextrw rax, xmm0, 6 |
| pextrw rax, xmm0, 7 |
| |
| ; Group 4: every extract reads xmm0. |
| pextrw rax, xmm0, 0 |
| pextrw rax, xmm0, 1 |
| pextrw rax, xmm0, 2 |
| pextrw rax, xmm0, 3 |
| pextrw rax, xmm0, 4 |
| pextrw rax, xmm0, 5 |
| pextrw rax, xmm0, 6 |
| pextrw rax, xmm0, 7 |
| |
| dec rdi |
| jnz .pass |
| ret |
| |
| ;------------------------------------------------------------------------------ |
| ; Name: Vector32ToRegister |
| ; Purpose: Extracts 32-bit vector register values into a main register |
| ; with pextrd. |
| ; Params: rdi = loops |
| ; Notes: A 256-byte chunk at 4 bytes per transfer needs 64 transfers: |
| ; one pass is 32 extracts and the loop count is doubled. Only eax |
| ; is written; xmm0 through xmm3 are sources. |
| ;------------------------------------------------------------------------------ |
| Vector32ToRegister: |
| _Vector32ToRegister: |
| add rdi, rdi ; Two passes per requested loop. |
| .pass: |
| ; Group 1 |
| pextrd eax, xmm1, 0 |
| pextrd eax, xmm2, 1 |
| pextrd eax, xmm3, 2 |
| pextrd eax, xmm1, 3 |
| pextrd eax, xmm2, 0 |
| pextrd eax, xmm3, 1 |
| pextrd eax, xmm0, 2 |
| pextrd eax, xmm0, 3 |
| |
| ; Group 2 |
| pextrd eax, xmm0, 0 |
| pextrd eax, xmm1, 1 |
| pextrd eax, xmm2, 2 |
| pextrd eax, xmm3, 3 |
| pextrd eax, xmm3, 3 |
| pextrd eax, xmm2, 2 |
| pextrd eax, xmm1, 1 |
| pextrd eax, xmm0, 0 |
| |
| ; Group 3 (same sequence as group 1) |
| pextrd eax, xmm1, 0 |
| pextrd eax, xmm2, 1 |
| pextrd eax, xmm3, 2 |
| pextrd eax, xmm1, 3 |
| pextrd eax, xmm2, 0 |
| pextrd eax, xmm3, 1 |
| pextrd eax, xmm0, 2 |
| pextrd eax, xmm0, 3 |
| |
| ; Group 4: every extract reads xmm0. |
| pextrd eax, xmm0, 0 |
| pextrd eax, xmm0, 1 |
| pextrd eax, xmm0, 2 |
| pextrd eax, xmm0, 3 |
| pextrd eax, xmm0, 0 |
| pextrd eax, xmm0, 0 |
| pextrd eax, xmm0, 0 |
| pextrd eax, xmm0, 0 |
| |
| dec rdi |
| jnz .pass |
| ret |
| |
| ;------------------------------------------------------------------------------ |
| ; Name: Vector64ToRegister |
| ; Purpose: Moves 64-bit lanes of vector (XMM) registers into a main register |
| ; using PEXTRQ. |
| ; Params: rdi = loops |
| ; Notes: Each pass of the loop issues 16 transfers of 8 bytes (128 bytes), |
| ; so the loop count is doubled to reach the nominal 256-byte chunk |
| ; (32 transfers) per requested loop. |
| ; Clobbers rax, rdi and flags. |
| ;------------------------------------------------------------------------------ |
| Vector64ToRegister: |
| _Vector64ToRegister: |
| shl rdi, 1 ; loops *= 2; see Notes. |
| .next_pass: |
| ; Transfers 1-8: alternating lanes 0/1, sources xmm1/xmm2/xmm3 twice over, |
| ; then xmm0. |
| pextrq rax, xmm1, 0 |
| pextrq rax, xmm2, 1 |
| pextrq rax, xmm3, 0 |
| pextrq rax, xmm1, 1 |
| pextrq rax, xmm2, 0 |
| pextrq rax, xmm3, 1 |
| pextrq rax, xmm0, 0 |
| pextrq rax, xmm0, 1 |
| |
| ; Transfers 9-16: xmm0 only -- lane 0 once, lane 1 three times, lane 0 four |
| ; times. |
| pextrq rax, xmm0, 0 |
| %rep 3 |
| pextrq rax, xmm0, 1 |
| %endrep |
| %rep 4 |
| pextrq rax, xmm0, 0 |
| %endrep |
| |
| dec rdi ; One pass done. |
| jnz .next_pass |
| ret |
| |
| ;------------------------------------------------------------------------------ |
| ; Name: CopySSE |
| ; Purpose: Copies memory chunks that are 16-byte aligned. |
| ; Params: rdi = ptr to destination memory area |
| ; rsi = ptr to source memory area |
| ; rdx = length in bytes |
| ; rcx = loops |
| ; Notes: The length is rounded down to a multiple of 256 bytes. If the |
| ; rounded length or the loop count is zero, nothing is copied. |
| ; Data is loaded with MOVDQA and stored with the non-temporal |
| ; MOVNTDQ, so both pointers must be 16-byte aligned. |
| ; r10 and xmm4-xmm15 are preserved; rcx, rdx, xmm0-xmm3 and flags |
| ; are modified. rdi and rsi hold their entry values on return. |
| ;------------------------------------------------------------------------------ |
| CopySSE: |
| _CopySSE: |
| push r10 |
| |
| and rdx, -256 ; Round length down to a multiple of 256. |
| jz .done ; Less than one full chunk: nothing to copy. |
| test rcx, rcx |
| jz .done ; Zero loops requested: nothing to do. |
| |
| ; Save our non-parameter XMM registers (12 registers x 16 bytes = 192). |
| sub rsp, 192 |
| movdqu [rsp], xmm4 |
| movdqu [16+rsp], xmm5 |
| movdqu [32+rsp], xmm6 |
| movdqu [48+rsp], xmm7 |
| movdqu [64+rsp], xmm8 |
| movdqu [80+rsp], xmm9 |
| movdqu [96+rsp], xmm10 |
| movdqu [112+rsp], xmm11 |
| movdqu [128+rsp], xmm12 |
| movdqu [144+rsp], xmm13 |
| movdqu [160+rsp], xmm14 |
| movdqu [176+rsp], xmm15 |
| |
| .pass: |
| mov r10, rdx ; r10 = bytes left in this pass (multiple of 256, > 0). |
| |
| .chunk: |
| ; prefetchnta [rsi] |
| ; Load one 256-byte chunk from the source into xmm0-xmm15. |
| movdqa xmm0, [rsi] |
| movdqa xmm1, [16+rsi] |
| movdqa xmm2, [32+rsi] |
| movdqa xmm3, [48+rsi] |
| movdqa xmm4, [64+rsi] |
| movdqa xmm5, [80+rsi] |
| movdqa xmm6, [96+rsi] |
| movdqa xmm7, [112+rsi] |
| movdqa xmm8, [128+rsi] |
| movdqa xmm9, [144+rsi] |
| movdqa xmm10, [160+rsi] |
| movdqa xmm11, [176+rsi] |
| movdqa xmm12, [192+rsi] |
| movdqa xmm13, [208+rsi] |
| movdqa xmm14, [224+rsi] |
| movdqa xmm15, [240+rsi] |
| |
| ; Store the chunk to the destination with non-temporal writes. |
| movntdq [rdi], xmm0 |
| movntdq [16+rdi], xmm1 |
| movntdq [32+rdi], xmm2 |
| movntdq [48+rdi], xmm3 |
| movntdq [64+rdi], xmm4 |
| movntdq [80+rdi], xmm5 |
| movntdq [96+rdi], xmm6 |
| movntdq [112+rdi], xmm7 |
| movntdq [128+rdi], xmm8 |
| movntdq [144+rdi], xmm9 |
| movntdq [160+rdi], xmm10 |
| movntdq [176+rdi], xmm11 |
| movntdq [192+rdi], xmm12 |
| movntdq [208+rdi], xmm13 |
| movntdq [224+rdi], xmm14 |
| movntdq [240+rdi], xmm15 |
| |
| add rsi, 256 |
| add rdi, 256 |
| |
| sub r10, 256 |
| jnz .chunk |
| |
| sub rsi, rdx ; rsi now points to start. |
| sub rdi, rdx ; rdi now points to start. |
| |
| dec rcx |
| jnz .pass |
| |
| ; Restore xmm4-xmm15, each from the slot it was saved in. |
| movdqu xmm4, [rsp] |
| movdqu xmm5, [16+rsp] |
| movdqu xmm6, [32+rsp] |
| movdqu xmm7, [48+rsp] |
| movdqu xmm8, [64+rsp] |
| movdqu xmm9, [80+rsp] |
| movdqu xmm10, [96+rsp] |
| movdqu xmm11, [112+rsp] |
| movdqu xmm12, [128+rsp] |
| movdqu xmm13, [144+rsp] |
| movdqu xmm14, [160+rsp] |
| movdqu xmm15, [176+rsp] |
| add rsp, 192 |
| |
| .done: |
| pop r10 |
| |
| ret |
| |