blob: 18e8f6e4e4ea33f07974ce5ee4522ae85c8ca7a1 [file]
; ============================================================================
; bandwidth 0.23, a benchmark to estimate memory transfer bandwidth.
; Copyright (C) 2005-2010 by Zack T Smith.
;
; This program is free software; you can redistribute it and/or modify
; it under the terms of the GNU General Public License as published by
; the Free Software Foundation; either version 2 of the License, or
; (at your option) any later version.
;
; This program is distributed in the hope that it will be useful,
; but WITHOUT ANY WARRANTY; without even the implied warranty of
; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
; GNU General Public License for more details.
;
; You should have received a copy of the GNU General Public License
; along with this program; if not, write to the Free Software
; Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
;
; The author may be reached at fbui@comcast.net.
; =============================================================================
bits 64
cpu x64
global Reader
global RandomReader
global ReaderSSE2
global RandomReaderSSE2
global Writer
global RandomWriter
global WriterSSE2
global RandomWriterSSE2
global WriterSSE2_bypass
global RandomWriterSSE2_bypass
global CopySSE
global _CopySSE
global has_sse2
global RegisterToRegister
global RegisterToVector
global VectorToRegister
global VectorToVector
global Register8ToVector
global Register16ToVector
global Register32ToVector
global Register64ToVector
global Vector8ToRegister
global Vector16ToRegister
global Vector32ToRegister
global Vector64ToRegister
global StackReader
global StackWriter
global _Reader
global _RandomReader
global _ReaderSSE2
global _RandomReaderSSE2
global _Writer
global _RandomWriter
global _WriterSSE2
global _RandomWriterSSE2
global _WriterSSE2_bypass
global _RandomWriterSSE2_bypass
global _has_sse2
global _RegisterToRegister
global _RegisterToVector
global _VectorToRegister
global _VectorToVector
global _Register8ToVector
global _Register16ToVector
global _Register32ToVector
global _Register64ToVector
global _Vector8ToRegister
global _Vector16ToRegister
global _Vector32ToRegister
global _Vector64ToRegister
global _StackReader
global _StackWriter
; Note:
; The Unix (System V AMD64) ABI passes integer parameters in these registers,
; in this order: rdi, rsi, rdx, rcx, r8, r9
section .text
;------------------------------------------------------------------------------
; Name:     has_sse2
; Purpose:  Reports whether CPUID advertises SSE2.
; Params:   none
; Returns:  eax = 1 if CPUID leaf 1 sets EDX bit 26 (SSE2), otherwise 0.
; Notes:    rbx, rcx and rdx are saved and restored around CPUID.
;------------------------------------------------------------------------------
has_sse2:
_has_sse2:
        push    rbx                     ; CPUID overwrites ebx/ecx/edx.
        push    rcx
        push    rdx
        mov     eax, 1                  ; Leaf 1: processor info and feature bits.
        cpuid
        xor     eax, eax                ; Drop the version info CPUID left in eax so
                                        ; the whole register ends up 0 or 1, not
                                        ; just its low byte.
        test    edx, 0x4000000          ; EDX bit 26 = SSE2.
        setnz   al
        pop     rdx
        pop     rcx
        pop     rbx
        ret
;------------------------------------------------------------------------------
; Name:     Reader
; Purpose:  Reads 64-bit values sequentially from an area of memory.
; Params:   rdi = ptr to memory area
;           rsi = length in bytes
;           rdx = loops
; Notes:    Memory is consumed in 256-byte chunks (32 quadword loads each).
;           Both loops test their condition at the bottom, so at least one
;           chunk is read and at least one pass is made.
;------------------------------------------------------------------------------
Reader:
_Reader:
        push    r10
        add     rsi, rdi                ; rsi = address just past the area.
.pass:
        mov     r10, rdi                ; r10 = read cursor, rewound every pass.
.chunk:
        mov     rax, [r10]
%assign READER_OFS 8
%rep 31                                 ; Remaining loads: offsets 8, 16, ... 248.
        mov     rax, [r10 + READER_OFS]
%assign READER_OFS READER_OFS + 8
%endrep
%undef READER_OFS
        add     r10, 256
        cmp     r10, rsi
        jb      .chunk
        dec     rdx
        jnz     .pass
        pop     r10
        ret
;------------------------------------------------------------------------------
; Name:     RandomReader
; Purpose:  Reads 64-bit values randomly from an area of memory.
; Params:   rdi = ptr to array of chunk pointers
;           rsi = # of chunks
;           rdx = loops
; Notes:    For every chunk pointer, all 32 quadwords of the 256-byte chunk
;           are loaded once, in a fixed scrambled order.
;------------------------------------------------------------------------------
RandomReader:
_RandomReader:
        push    r10
        push    r11
.pass:
        xor     r11, r11                ; r11 = index into the pointer array.
.chunk:
        mov     r10, [rdi + 8*r11]      ; r10 = current chunk (64-bit pointers).
        mov     rax, [r10 + 96]
        mov     rax, [r10]
        mov     rax, [r10 + 120]
        mov     rax, [r10 + 184]
        mov     rax, [r10 + 160]
        mov     rax, [r10 + 176]
        mov     rax, [r10 + 112]
        mov     rax, [r10 + 80]
        mov     rax, [r10 + 32]
        mov     rax, [r10 + 128]
        mov     rax, [r10 + 88]
        mov     rax, [r10 + 40]
        mov     rax, [r10 + 48]
        mov     rax, [r10 + 72]
        mov     rax, [r10 + 200]
        mov     rax, [r10 + 24]
        mov     rax, [r10 + 152]
        mov     rax, [r10 + 16]
        mov     rax, [r10 + 248]
        mov     rax, [r10 + 56]
        mov     rax, [r10 + 240]
        mov     rax, [r10 + 208]
        mov     rax, [r10 + 104]
        mov     rax, [r10 + 216]
        mov     rax, [r10 + 136]
        mov     rax, [r10 + 232]
        mov     rax, [r10 + 64]
        mov     rax, [r10 + 224]
        mov     rax, [r10 + 144]
        mov     rax, [r10 + 192]
        mov     rax, [r10 + 8]
        mov     rax, [r10 + 168]
        inc     r11
        cmp     r11, rsi
        jb      .chunk
        dec     rdx
        jnz     .pass
        pop     r11
        pop     r10
        ret
;------------------------------------------------------------------------------
; Name:     RandomReaderSSE2
; Purpose:  Reads 128-bit values randomly from an area of memory.
; Params:   rdi = ptr to array of chunk pointers
;           rsi = # of chunks
;           rdx = loops
; Notes:    Uses movdqa, so every chunk must be 16-byte aligned. Each
;           256-byte chunk is covered by 16 loads in a fixed scrambled order.
;------------------------------------------------------------------------------
RandomReaderSSE2:
_RandomReaderSSE2:
        push    r10
        push    r11
.pass:
        xor     r11, r11                ; r11 = index into the pointer array.
.chunk:
        mov     r10, [rdi + 8*r11]      ; r10 = current chunk.
        movdqa  xmm0, [r10 + 240]
        movdqa  xmm0, [r10 + 128]
        movdqa  xmm0, [r10 + 64]
        movdqa  xmm0, [r10 + 208]
        movdqa  xmm0, [r10 + 112]
        movdqa  xmm0, [r10 + 176]
        movdqa  xmm0, [r10 + 144]
        movdqa  xmm0, [r10]
        movdqa  xmm0, [r10 + 96]
        movdqa  xmm0, [r10 + 16]
        movdqa  xmm0, [r10 + 192]
        movdqa  xmm0, [r10 + 160]
        movdqa  xmm0, [r10 + 32]
        movdqa  xmm0, [r10 + 48]
        movdqa  xmm0, [r10 + 224]
        movdqa  xmm0, [r10 + 80]
        inc     r11
        cmp     r11, rsi
        jb      .chunk
        dec     rdx
        jnz     .pass
        pop     r11
        pop     r10
        ret
;------------------------------------------------------------------------------
; Name:     RandomWriter
; Purpose:  Writes 64-bit values randomly to an area of memory.
; Params:   rdi = ptr to array of chunk pointers
;           rsi = # of chunks
;           rdx = loops
;           rcx = datum to write
; Notes:    For every chunk pointer, all 32 quadwords of the 256-byte chunk
;           are stored once, in a fixed scrambled order.
;------------------------------------------------------------------------------
RandomWriter:
_RandomWriter:
        push    r10
        push    r11
.pass:
        xor     r11, r11                ; r11 = index into the pointer array.
.chunk:
        mov     r10, [rdi + 8*r11]      ; r10 = current chunk (64-bit pointers).
        mov     [r10 + 96], rcx
        mov     [r10], rcx
        mov     [r10 + 120], rcx
        mov     [r10 + 184], rcx
        mov     [r10 + 160], rcx
        mov     [r10 + 176], rcx
        mov     [r10 + 112], rcx
        mov     [r10 + 80], rcx
        mov     [r10 + 32], rcx
        mov     [r10 + 128], rcx
        mov     [r10 + 88], rcx
        mov     [r10 + 40], rcx
        mov     [r10 + 48], rcx
        mov     [r10 + 72], rcx
        mov     [r10 + 200], rcx
        mov     [r10 + 24], rcx
        mov     [r10 + 152], rcx
        mov     [r10 + 16], rcx
        mov     [r10 + 248], rcx
        mov     [r10 + 56], rcx
        mov     [r10 + 240], rcx
        mov     [r10 + 208], rcx
        mov     [r10 + 104], rcx
        mov     [r10 + 216], rcx
        mov     [r10 + 136], rcx
        mov     [r10 + 232], rcx
        mov     [r10 + 64], rcx
        mov     [r10 + 224], rcx
        mov     [r10 + 144], rcx
        mov     [r10 + 192], rcx
        mov     [r10 + 8], rcx
        mov     [r10 + 168], rcx
        inc     r11
        cmp     r11, rsi
        jb      .chunk
        dec     rdx
        jnz     .pass
        pop     r11
        pop     r10
        ret
;------------------------------------------------------------------------------
; Name:     RandomWriterSSE2
; Purpose:  Writes 128-bit values randomly to an area of memory.
; Params:   rdi = ptr to array of chunk pointers
;           rsi = # of chunks
;           rdx = loops
;           rcx = datum to write (replicated into both 64-bit halves)
; Notes:    Uses movdqa, so every chunk must be 16-byte aligned. Each
;           256-byte chunk is covered by 16 stores in a fixed scrambled order.
;------------------------------------------------------------------------------
RandomWriterSSE2:
_RandomWriterSSE2:
        push    r10
        push    r11
        ; Build the 128-bit datum. The previous "pslldq xmm1, 64" shifted by
        ; 64 *bytes* (pslldq counts bytes, not bits), which cleared the register
        ; and left the upper quadword of every store zero.
        movq    xmm0, rcx               ; xmm0 = 0 : datum
        punpcklqdq xmm0, xmm0           ; xmm0 = datum : datum
.pass:
        xor     r11, r11                ; r11 = index into the pointer array.
.chunk:
        mov     r10, [rdi + 8*r11]      ; r10 = current chunk (64-bit pointers).
        movdqa  [r10 + 240], xmm0
        movdqa  [r10 + 128], xmm0
        movdqa  [r10 + 208], xmm0
        movdqa  [r10 + 112], xmm0
        movdqa  [r10 + 64], xmm0
        movdqa  [r10 + 176], xmm0
        movdqa  [r10 + 144], xmm0
        movdqa  [r10], xmm0
        movdqa  [r10 + 96], xmm0
        movdqa  [r10 + 16], xmm0
        movdqa  [r10 + 192], xmm0
        movdqa  [r10 + 160], xmm0
        movdqa  [r10 + 32], xmm0
        movdqa  [r10 + 48], xmm0
        movdqa  [r10 + 224], xmm0
        movdqa  [r10 + 80], xmm0
        inc     r11
        cmp     r11, rsi
        jb      .chunk
        dec     rdx
        jnz     .pass
        pop     r11
        pop     r10
        ret
;------------------------------------------------------------------------------
; Name:     RandomWriterSSE2_bypass
; Purpose:  Writes 128-bit values randomly into memory, bypassing caches.
; Params:   rdi = ptr to array of chunk pointers
;           rsi = # of chunks
;           rdx = loops
;           rcx = datum to write (replicated into both 64-bit halves)
; Notes:    Uses movntdq (non-temporal store), which requires every chunk to
;           be 16-byte aligned. Each 256-byte chunk is covered by 16 stores
;           in a fixed scrambled order.
;------------------------------------------------------------------------------
RandomWriterSSE2_bypass:
_RandomWriterSSE2_bypass:
        push    r10
        push    r11
        ; Build the 128-bit datum. The previous "pslldq xmm1, 64" shifted by
        ; 64 *bytes* (pslldq counts bytes, not bits), which cleared the register
        ; and left the upper quadword of every store zero.
        movq    xmm0, rcx               ; xmm0 = 0 : datum
        punpcklqdq xmm0, xmm0           ; xmm0 = datum : datum
.pass:
        xor     r11, r11                ; r11 = index into the pointer array.
.chunk:
        mov     r10, [rdi + 8*r11]      ; r10 = current chunk (64-bit pointers).
        movntdq [r10 + 240], xmm0
        movntdq [r10 + 128], xmm0
        movntdq [r10 + 208], xmm0
        movntdq [r10 + 112], xmm0
        movntdq [r10 + 64], xmm0
        movntdq [r10 + 176], xmm0
        movntdq [r10 + 144], xmm0
        movntdq [r10], xmm0
        movntdq [r10 + 96], xmm0
        movntdq [r10 + 16], xmm0
        movntdq [r10 + 192], xmm0
        movntdq [r10 + 160], xmm0
        movntdq [r10 + 32], xmm0
        movntdq [r10 + 48], xmm0
        movntdq [r10 + 224], xmm0
        movntdq [r10 + 80], xmm0
        inc     r11
        cmp     r11, rsi
        jb      .chunk
        dec     rdx
        jnz     .pass
        pop     r11
        pop     r10
        ret
;------------------------------------------------------------------------------
; Name:     ReaderSSE2
; Purpose:  Reads 128-bit values sequentially from an area of memory.
; Params:   rdi = ptr to memory area
;           rsi = length in bytes
;           rdx = loops
; Notes:    Uses movdqa, so the area must be 16-byte aligned. Memory is
;           consumed in 256-byte chunks (16 loads each); both loops test
;           their condition at the bottom.
;------------------------------------------------------------------------------
ReaderSSE2:
_ReaderSSE2:
        push    r10
        add     rsi, rdi                ; rsi = address just past the area.
.pass:
        mov     r10, rdi                ; r10 = read cursor, rewound every pass.
.chunk:
        movdqa  xmm0, [r10]
%assign READERSSE2_OFS 16
%rep 15                                 ; Remaining loads: offsets 16, 32, ... 240.
        movdqa  xmm0, [r10 + READERSSE2_OFS]
%assign READERSSE2_OFS READERSSE2_OFS + 16
%endrep
%undef READERSSE2_OFS
        add     r10, 256
        cmp     r10, rsi
        jb      .chunk
        dec     rdx
        jnz     .pass
        pop     r10
        ret
;------------------------------------------------------------------------------
; Name:     Writer
; Purpose:  Writes 64-bit value sequentially to an area of memory.
; Params:   rdi = ptr to memory area
;           rsi = length in bytes
;           rdx = loops
;           rcx = quad to write
; Notes:    Memory is filled in 256-byte chunks (32 quadword stores each).
;           Both loops test their condition at the bottom.
;------------------------------------------------------------------------------
Writer:
_Writer:
        push    r10
        add     rsi, rdi                ; rsi = address just past the area.
.pass:
        mov     r10, rdi                ; r10 = write cursor, rewound every pass.
.chunk:
        mov     [r10], rcx
%assign WRITER_OFS 8
%rep 31                                 ; Remaining stores: offsets 8, 16, ... 248.
        mov     [r10 + WRITER_OFS], rcx
%assign WRITER_OFS WRITER_OFS + 8
%endrep
%undef WRITER_OFS
        add     r10, 256
        cmp     r10, rsi
        jb      .chunk
        dec     rdx
        jnz     .pass
        pop     r10
        ret
;------------------------------------------------------------------------------
; Name:     WriterSSE2
; Purpose:  Writes 128-bit value sequentially to an area of memory.
; Params:   rdi = ptr to memory area
;           rsi = length in bytes
;           rdx = loops
;           rcx = quad to write
; Notes:    Uses movdqa, so the area must be 16-byte aligned. The quad is
;           placed in the low half of xmm0 by movq, which zeroes the upper
;           half; every 16-byte store is therefore quad followed by zero.
;------------------------------------------------------------------------------
WriterSSE2:
_WriterSSE2:
        push    r10
        add     rsi, rdi                ; rsi = address just past the area.
        movq    xmm0, rcx               ; xmm0 = 0 : quad
.pass:
        mov     r10, rdi                ; r10 = write cursor, rewound every pass.
.chunk:
        movdqa  [r10], xmm0
%assign WRITERSSE2_OFS 16
%rep 15                                 ; Remaining stores: offsets 16, 32, ... 240.
        movdqa  [r10 + WRITERSSE2_OFS], xmm0
%assign WRITERSSE2_OFS WRITERSSE2_OFS + 16
%endrep
%undef WRITERSSE2_OFS
        add     r10, 256
        cmp     r10, rsi
        jb      .chunk
        dec     rdx
        jnz     .pass
        pop     r10
        ret
;------------------------------------------------------------------------------
; Name:     WriterSSE2_bypass
; Purpose:  Writes 128-bit value sequentially to an area of memory using
;           non-temporal (cache-bypassing) stores.
; Params:   rdi = ptr to memory area
;           rsi = length in bytes
;           rdx = loops
;           rcx = quad to write
; Notes:    movntdq requires the area to be 16-byte aligned. The quad is
;           placed in the low half of xmm0 by movq, which zeroes the upper
;           half; every 16-byte store is therefore quad followed by zero.
;------------------------------------------------------------------------------
WriterSSE2_bypass:
_WriterSSE2_bypass:
        push    r10
        add     rsi, rdi                ; rsi = address just past the area.
        movq    xmm0, rcx               ; xmm0 = 0 : quad
.pass:
        mov     r10, rdi                ; r10 = write cursor, rewound every pass.
.chunk:
        movntdq [r10], xmm0             ; Write bypassing cache.
%assign WRITERNT_OFS 16
%rep 15                                 ; Remaining stores: offsets 16, 32, ... 240.
        movntdq [r10 + WRITERNT_OFS], xmm0
%assign WRITERNT_OFS WRITERNT_OFS + 16
%endrep
%undef WRITERNT_OFS
        add     r10, 256
        cmp     r10, rsi
        jb      .chunk
        dec     rdx
        jnz     .pass
        pop     r10
        ret
;------------------------------------------------------------------------------
; Name:     StackReader
; Purpose:  Reads 64-bit values off the stack into registers of
;           the main register set, effectively testing L1 cache access
;           *and* effective-address calculation speed.
; Params:   rdi = loops
; Notes:    Seven quadwords are pushed and every load stays inside that
;           56-byte area. Earlier code read [rsp+80] and [rsp+88], which lie
;           above the return address in the caller's frame, while the slots
;           at [rsp+40] and [rsp+48] were never touched.
;------------------------------------------------------------------------------
StackReader:
_StackReader:
        push    qword 7000              ; [rsp+48]
        push    qword 6000              ; [rsp+40]
        push    qword 5000              ; [rsp+32]
        push    qword 4000              ; [rsp+24]
        push    qword 3000              ; [rsp+16]
        push    qword 2000              ; [rsp+8]
        push    qword 1000              ; [rsp]
.L1:
        mov     rax, [rsp]
        mov     rax, [rsp+16]
        mov     rax, [rsp+24]
        mov     rax, [rsp+32]
        mov     rax, [rsp+40]
        mov     rax, [rsp+8]
        mov     rax, [rsp+48]
        mov     rax, [rsp]

        mov     rax, [rsp]
        mov     rax, [rsp+16]
        mov     rax, [rsp+24]
        mov     rax, [rsp+32]
        mov     rax, [rsp+40]
        mov     rax, [rsp+8]
        mov     rax, [rsp+48]
        mov     rax, [rsp]

        mov     rax, [rsp]
        mov     rax, [rsp+16]
        mov     rax, [rsp+24]
        mov     rax, [rsp+32]
        mov     rax, [rsp+40]
        mov     rax, [rsp+8]
        mov     rax, [rsp+48]
        mov     rax, [rsp+8]

        mov     rax, [rsp+8]
        mov     rax, [rsp+16]
        mov     rax, [rsp+24]
        mov     rax, [rsp+32]
        mov     rax, [rsp+40]
        mov     rax, [rsp+8]
        mov     rax, [rsp+48]
        mov     rax, [rsp+8]

        sub     rdi, 1
        jnz     .L1
        add     rsp, 56                 ; Drop the seven pushed quadwords.
        ret
;------------------------------------------------------------------------------
; Name:     StackWriter
; Purpose:  Writes 64-bit values into the stack from registers of
;           the main register set, effectively testing L1 cache access
;           *and* effective-address calculation speed.
; Params:   rdi = loops
; Notes:    Seven quadwords are pushed to reserve a 56-byte scratch area and
;           every store stays inside it. Earlier code stored to [rsp+80] and
;           [rsp+88], which lie above the return address, and so overwrote
;           16 bytes of the caller's stack frame with zero.
;------------------------------------------------------------------------------
StackWriter:
_StackWriter:
        push    qword 7000              ; [rsp+48]
        push    qword 6000              ; [rsp+40]
        push    qword 5000              ; [rsp+32]
        push    qword 4000              ; [rsp+24]
        push    qword 3000              ; [rsp+16]
        push    qword 2000              ; [rsp+8]
        push    qword 1000              ; [rsp]
        xor     rax, rax                ; Value stored by every write.
.L1:
        mov     [rsp], rax
        mov     [rsp+16], rax
        mov     [rsp+24], rax
        mov     [rsp+32], rax
        mov     [rsp+40], rax
        mov     [rsp+8], rax
        mov     [rsp+48], rax
        mov     [rsp], rax

        mov     [rsp], rax
        mov     [rsp+16], rax
        mov     [rsp+24], rax
        mov     [rsp+32], rax
        mov     [rsp+40], rax
        mov     [rsp+8], rax
        mov     [rsp+48], rax
        mov     [rsp], rax

        mov     [rsp], rax
        mov     [rsp+16], rax
        mov     [rsp+24], rax
        mov     [rsp+32], rax
        mov     [rsp+40], rax
        mov     [rsp+8], rax
        mov     [rsp+48], rax
        mov     [rsp+8], rax

        mov     [rsp+8], rax
        mov     [rsp+16], rax
        mov     [rsp+24], rax
        mov     [rsp+32], rax
        mov     [rsp+40], rax
        mov     [rsp+8], rax
        mov     [rsp+48], rax
        mov     [rsp+8], rax

        sub     rdi, 1
        jnz     .L1
        add     rsp, 56                 ; Drop the seven pushed quadwords.
        ret
;------------------------------------------------------------------------------
; Name:     RegisterToRegister
; Purpose:  Reads/writes 64-bit values between registers of
;           the main register set.
; Params:   rdi = loops
; Notes:    Each pass performs 32 register-to-register moves into rax
;           (four repetitions of the same eight-source sequence).
;------------------------------------------------------------------------------
RegisterToRegister:
_RegisterToRegister:
.again:
%rep 4
        mov     rax, rbx
        mov     rax, rcx
        mov     rax, rdx
        mov     rax, rsi
        mov     rax, rdi
        mov     rax, rbp
        mov     rax, rsp
        mov     rax, rbx
%endrep
        sub     rdi, 1
        jnz     .again
        ret
;------------------------------------------------------------------------------
; Name:     VectorToVector
; Purpose:  Reads/writes 128-bit values between registers of
;           the vector register set, in this case XMM.
; Params:   rdi = loops
; Notes:    Each movdqa moves 16 bytes, so the 16 moves per pass amount to
;           one 256-byte chunk. Only xmm0..xmm3 are involved.
;------------------------------------------------------------------------------
VectorToVector:
_VectorToVector:
.again:
        movdqa  xmm0, xmm1
        movdqa  xmm0, xmm2
        movdqa  xmm0, xmm3
        movdqa  xmm2, xmm0
        movdqa  xmm1, xmm2
        movdqa  xmm2, xmm1
        movdqa  xmm0, xmm3
        movdqa  xmm3, xmm1
        movdqa  xmm3, xmm2
        movdqa  xmm1, xmm3
        movdqa  xmm2, xmm1
        movdqa  xmm0, xmm1
        movdqa  xmm1, xmm2
        movdqa  xmm0, xmm1
        movdqa  xmm0, xmm3
        movdqa  xmm3, xmm0
        sub     rdi, 1
        jnz     .again
        ret
;------------------------------------------------------------------------------
; Name:     RegisterToVector
; Purpose:  Writes 64-bit main register values into 128-bit vector register
;           clearing the upper unused bits.
; Params:   rdi = loops
; Notes:    Each movq transfers 8 bytes; the 32 transfers per pass amount to
;           one 256-byte chunk. The pass is one lead-in group of eight moves
;           followed by three repetitions of a second group of eight.
;------------------------------------------------------------------------------
RegisterToVector:
_RegisterToVector:
.again:
        ; Lead-in group.
        movq    xmm1, rax
        movq    xmm2, rsi
        movq    xmm3, rbx
        movq    xmm1, rcx
        movq    xmm2, rsi
        movq    xmm3, rsp
        movq    xmm0, rdi
        movq    xmm0, rdx
%rep 3
        movq    xmm0, rax
        movq    xmm1, rsi
        movq    xmm2, rbx
        movq    xmm3, rcx
        movq    xmm0, rsi
        movq    xmm3, rsp
        movq    xmm2, rdi
        movq    xmm1, rdx
%endrep
        dec     rdi
        jnz     .again
        ret
;------------------------------------------------------------------------------
; Name:     VectorToRegister
; Purpose:  Writes lower 64 bits of vector register into 64-bit main
;           register.
; Params:   rdi = loops
; Notes:    32 movq transfers into rax per pass: one lead-in group of eight
;           followed by three repetitions of a second group of eight.
;------------------------------------------------------------------------------
VectorToRegister:
_VectorToRegister:
.again:
        ; Lead-in group.
        movq    rax, xmm1
        movq    rax, xmm2
        movq    rax, xmm3
        movq    rax, xmm1
        movq    rax, xmm2
        movq    rax, xmm3
        movq    rax, xmm0
        movq    rax, xmm0
%rep 3
        movq    rax, xmm0
        movq    rax, xmm1
        movq    rax, xmm2
        movq    rax, xmm3
        movq    rax, xmm0
        movq    rax, xmm3
        movq    rax, xmm2
        movq    rax, xmm1
%endrep
        dec     rdi
        jnz     .again
        ret
;------------------------------------------------------------------------------
; Name:     Register8ToVector
; Purpose:  Writes 8-bit main register values into 128-bit vector register
;           without clearing the unused bits.
; Params:   rdi = loops
; Notes:    The loop count is multiplied by 4 up front; with 64 one-byte
;           inserts per pass that is 256 bytes per requested loop.
;------------------------------------------------------------------------------
Register8ToVector:
_Register8ToVector:
        sal     rdi, 2                  ; Force some repetition.
.again:
        pinsrb  xmm1, al, 0
        pinsrb  xmm2, bl, 1
        pinsrb  xmm3, cl, 2
        pinsrb  xmm1, dl, 3
        pinsrb  xmm2, sil, 4
        pinsrb  xmm3, dil, 5
        pinsrb  xmm0, bpl, 6
        pinsrb  xmm0, spl, 7

        pinsrb  xmm0, al, 0
        pinsrb  xmm1, bl, 1
        pinsrb  xmm2, cl, 2
        pinsrb  xmm3, dl, 3
        pinsrb  xmm3, al, 4
        pinsrb  xmm2, bl, 5
        pinsrb  xmm1, bpl, 6
        pinsrb  xmm0, spl, 7

        pinsrb  xmm1, r8b, 0
        pinsrb  xmm2, r9b, 1
        pinsrb  xmm3, r10b, 2
        pinsrb  xmm1, r11b, 3
        pinsrb  xmm2, r12b, 4
        pinsrb  xmm3, al, 5
        pinsrb  xmm0, cl, 6
        pinsrb  xmm0, bl, 7

        pinsrb  xmm0, r8b, 0
        pinsrb  xmm0, r9b, 1
        pinsrb  xmm0, r10b, 2
        pinsrb  xmm0, r11b, 3
        pinsrb  xmm0, r12b, 4
        pinsrb  xmm0, al, 5
        pinsrb  xmm0, cl, 6
        pinsrb  xmm0, bl, 7

        pinsrb  xmm1, al, 0
        pinsrb  xmm2, bl, 1
        pinsrb  xmm3, cl, 2
        pinsrb  xmm1, dl, 3
        pinsrb  xmm2, sil, 4
        pinsrb  xmm3, dil, 5
        pinsrb  xmm0, bpl, 6
        pinsrb  xmm0, spl, 7

        pinsrb  xmm0, al, 10
        pinsrb  xmm1, bl, 11
        pinsrb  xmm2, cl, 12
        pinsrb  xmm3, dl, 13
        pinsrb  xmm3, dil, 14
        pinsrb  xmm2, cl, 15
        pinsrb  xmm1, al, 6
        pinsrb  xmm0, bpl, 7

        pinsrb  xmm1, r8b, 10
        pinsrb  xmm2, r9b, 11
        pinsrb  xmm3, r10b, 12
        pinsrb  xmm1, r11b, 13
        pinsrb  xmm2, r12b, 14
        pinsrb  xmm3, al, 15
        pinsrb  xmm0, cl, 6
        pinsrb  xmm0, bl, 7

        pinsrb  xmm0, r8b, 9
        pinsrb  xmm0, r9b, 8
        pinsrb  xmm0, r10b, 11
        pinsrb  xmm0, r11b, 3
        pinsrb  xmm0, r12b, 4
        pinsrb  xmm0, al, 5
        pinsrb  xmm0, cl, 6
        pinsrb  xmm0, bl, 7

        dec     rdi
        jnz     .again
        ret
;------------------------------------------------------------------------------
; Name:     Register16ToVector
; Purpose:  Writes 16-bit main register values into 128-bit vector register
;           without clearing the unused bits.
; Params:   rdi = loops
; Notes:    The loop count is doubled up front; with 64 two-byte inserts per
;           pass that is 256 bytes per requested loop. The pass is the same
;           32-insert sequence emitted twice.
;------------------------------------------------------------------------------
Register16ToVector:
_Register16ToVector:
        sal     rdi, 1                  ; Force some repetition.
.again:
%rep 2
        pinsrw  xmm1, ax, 0
        pinsrw  xmm2, bx, 1
        pinsrw  xmm3, cx, 2
        pinsrw  xmm1, dx, 3
        pinsrw  xmm2, si, 4
        pinsrw  xmm3, di, 5
        pinsrw  xmm0, bp, 6
        pinsrw  xmm0, sp, 7

        pinsrw  xmm0, ax, 0
        pinsrw  xmm1, bx, 1
        pinsrw  xmm2, cx, 2
        pinsrw  xmm3, dx, 3
        pinsrw  xmm3, si, 4
        pinsrw  xmm2, di, 5
        pinsrw  xmm1, bp, 6
        pinsrw  xmm0, sp, 7

        pinsrw  xmm1, r8w, 0
        pinsrw  xmm2, r9w, 1
        pinsrw  xmm3, r10w, 2
        pinsrw  xmm1, r11w, 3
        pinsrw  xmm2, r12w, 4
        pinsrw  xmm3, ax, 5
        pinsrw  xmm0, bp, 6
        pinsrw  xmm0, bx, 7

        pinsrw  xmm0, r8w, 0
        pinsrw  xmm0, r9w, 1
        pinsrw  xmm0, r10w, 2
        pinsrw  xmm0, r11w, 3
        pinsrw  xmm0, r12w, 4
        pinsrw  xmm0, ax, 5
        pinsrw  xmm0, bp, 6
        pinsrw  xmm0, bx, 7
%endrep
        dec     rdi
        jnz     .again
        ret
;------------------------------------------------------------------------------
; Name:     Register32ToVector
; Purpose:  Writes 32-bit main register values into 128-bit vector register
;           without clearing the unused bits.
; Params:   rdi = loops
; Notes:    Each insert moves 4 bytes, so the 64 inserts per pass amount to
;           one 256-byte chunk. The pass is the same 32-insert sequence
;           emitted twice.
;------------------------------------------------------------------------------
Register32ToVector:
_Register32ToVector:
.again:
%rep 2
        pinsrd  xmm1, eax, 0
        pinsrd  xmm2, ebx, 1
        pinsrd  xmm3, ecx, 2
        pinsrd  xmm1, edx, 3
        pinsrd  xmm2, esi, 0
        pinsrd  xmm3, edi, 1
        pinsrd  xmm0, ebp, 2
        pinsrd  xmm0, esp, 3

        pinsrd  xmm0, eax, 0
        pinsrd  xmm1, ebx, 1
        pinsrd  xmm2, ecx, 2
        pinsrd  xmm3, edx, 3
        pinsrd  xmm3, esi, 3
        pinsrd  xmm2, edi, 2
        pinsrd  xmm1, ebp, 1
        pinsrd  xmm0, esp, 0

        pinsrd  xmm1, r8d, 0
        pinsrd  xmm2, r9d, 1
        pinsrd  xmm3, r10d, 2
        pinsrd  xmm1, r11d, 3
        pinsrd  xmm2, r12d, 0
        pinsrd  xmm3, eax, 1
        pinsrd  xmm0, ebp, 2
        pinsrd  xmm0, ebx, 3

        pinsrd  xmm0, r8d, 0
        pinsrd  xmm0, r9d, 1
        pinsrd  xmm0, r10d, 2
        pinsrd  xmm0, r11d, 3
        pinsrd  xmm0, r12d, 0
        pinsrd  xmm0, eax, 0
        pinsrd  xmm0, ebp, 0
        pinsrd  xmm0, ebx, 0
%endrep
        dec     rdi
        jnz     .again
        ret
;------------------------------------------------------------------------------
; Name:     Register64ToVector
; Purpose:  Writes 64-bit main register values into 128-bit vector register
;           without clearing the unused bits.
; Params:   rdi = loops
; Notes:    Each insert moves 8 bytes. The loop count is doubled up front,
;           so the 16 inserts per pass give 32 transfers (256 bytes) per
;           requested loop.
;------------------------------------------------------------------------------
Register64ToVector:
_Register64ToVector:
        add     rdi, rdi                ; Two passes per requested loop.
.again:
        pinsrq  xmm1, r8, 0
        pinsrq  xmm2, r9, 1
        pinsrq  xmm3, r10, 0
        pinsrq  xmm1, r11, 1
        pinsrq  xmm2, r12, 0
        pinsrq  xmm3, rax, 1
        pinsrq  xmm0, rbp, 0
        pinsrq  xmm0, rbx, 1

        pinsrq  xmm0, r8, 0
        pinsrq  xmm0, r9, 1
        pinsrq  xmm0, r10, 1
        pinsrq  xmm0, r11, 1
        pinsrq  xmm0, r12, 0
        pinsrq  xmm0, rax, 0
        pinsrq  xmm0, rbp, 0
        pinsrq  xmm0, rbx, 0

        dec     rdi
        jnz     .again
        ret
;------------------------------------------------------------------------------
; Name:     Vector8ToRegister
; Purpose:  Writes 8-bit vector register values into main register.
; Params:   rdi = loops
; Notes:    The loop count is multiplied by 8 up front; with 32 one-byte
;           extracts per pass that is 256 bytes per requested loop.
;------------------------------------------------------------------------------
Vector8ToRegister:
_Vector8ToRegister:
        sal     rdi, 3                  ; Force some repetition.
.again:
        pextrb  rax, xmm1, 0
        pextrb  rax, xmm2, 1
        pextrb  rax, xmm3, 2
        pextrb  rax, xmm1, 3
        pextrb  rax, xmm2, 4
        pextrb  rax, xmm3, 5
        pextrb  rax, xmm0, 6
        pextrb  rax, xmm0, 7

        pextrb  rax, xmm0, 0
        pextrb  rax, xmm1, 1
        pextrb  rax, xmm2, 2
        pextrb  rax, xmm3, 3
        pextrb  rax, xmm3, 4
        pextrb  rax, xmm2, 5
        pextrb  rax, xmm1, 6
        pextrb  rax, xmm0, 7

        pextrb  rax, xmm1, 0
        pextrb  rax, xmm2, 1
        pextrb  rax, xmm3, 2
        pextrb  rax, xmm1, 3
        pextrb  rax, xmm2, 4
        pextrb  rax, xmm3, 5
        pextrb  rax, xmm0, 6
        pextrb  rax, xmm0, 7

        pextrb  rax, xmm0, 0
        pextrb  rax, xmm0, 1
        pextrb  rax, xmm0, 2
        pextrb  rax, xmm0, 3
        pextrb  rax, xmm0, 4
        pextrb  rax, xmm0, 5
        pextrb  rax, xmm0, 6
        pextrb  rax, xmm0, 7

        dec     rdi
        jnz     .again
        ret
;------------------------------------------------------------------------------
; Name:     Vector16ToRegister
; Purpose:  Writes 16-bit vector register values into main register.
; Params:   rdi = loops
; Notes:    256 byte chunk / 2 bytes per transfer = 128 transfers: the loop
;           count is multiplied by 4 up front and each pass has 32 extracts.
;------------------------------------------------------------------------------
Vector16ToRegister:
_Vector16ToRegister:
        sal     rdi, 2                  ; Force some repetition.
.again:
        pextrw  rax, xmm1, 0
        pextrw  rax, xmm2, 1
        pextrw  rax, xmm3, 2
        pextrw  rax, xmm1, 3
        pextrw  rax, xmm2, 4
        pextrw  rax, xmm3, 5
        pextrw  rax, xmm0, 6
        pextrw  rax, xmm0, 7

        pextrw  rax, xmm0, 0
        pextrw  rax, xmm1, 1
        pextrw  rax, xmm2, 2
        pextrw  rax, xmm3, 3
        pextrw  rax, xmm3, 4
        pextrw  rax, xmm2, 5
        pextrw  rax, xmm1, 6
        pextrw  rax, xmm0, 7

        pextrw  rax, xmm1, 0
        pextrw  rax, xmm2, 1
        pextrw  rax, xmm3, 2
        pextrw  rax, xmm1, 3
        pextrw  rax, xmm2, 4
        pextrw  rax, xmm3, 5
        pextrw  rax, xmm0, 6
        pextrw  rax, xmm0, 7

        pextrw  rax, xmm0, 0
        pextrw  rax, xmm0, 1
        pextrw  rax, xmm0, 2
        pextrw  rax, xmm0, 3
        pextrw  rax, xmm0, 4
        pextrw  rax, xmm0, 5
        pextrw  rax, xmm0, 6
        pextrw  rax, xmm0, 7

        dec     rdi
        jnz     .again
        ret
;------------------------------------------------------------------------------
; Name:     Vector32ToRegister
; Purpose:  Writes 32-bit vector register values into main register.
; Params:   rdi = loops
; Notes:    256 byte chunk / 4 bytes per transfer = 64 transfers: the loop
;           count is doubled up front and each pass has 32 extracts.
;------------------------------------------------------------------------------
Vector32ToRegister:
_Vector32ToRegister:
        add     rdi, rdi                ; Two passes per requested loop.
.again:
        pextrd  eax, xmm1, 0
        pextrd  eax, xmm2, 1
        pextrd  eax, xmm3, 2
        pextrd  eax, xmm1, 3
        pextrd  eax, xmm2, 0
        pextrd  eax, xmm3, 1
        pextrd  eax, xmm0, 2
        pextrd  eax, xmm0, 3

        pextrd  eax, xmm0, 0
        pextrd  eax, xmm1, 1
        pextrd  eax, xmm2, 2
        pextrd  eax, xmm3, 3
        pextrd  eax, xmm3, 3
        pextrd  eax, xmm2, 2
        pextrd  eax, xmm1, 1
        pextrd  eax, xmm0, 0

        pextrd  eax, xmm1, 0
        pextrd  eax, xmm2, 1
        pextrd  eax, xmm3, 2
        pextrd  eax, xmm1, 3
        pextrd  eax, xmm2, 0
        pextrd  eax, xmm3, 1
        pextrd  eax, xmm0, 2
        pextrd  eax, xmm0, 3

        pextrd  eax, xmm0, 0
        pextrd  eax, xmm0, 1
        pextrd  eax, xmm0, 2
        pextrd  eax, xmm0, 3
        pextrd  eax, xmm0, 0
        pextrd  eax, xmm0, 0
        pextrd  eax, xmm0, 0
        pextrd  eax, xmm0, 0

        dec     rdi
        jnz     .again
        ret
;------------------------------------------------------------------------------
; Name:     Vector64ToRegister
; Purpose:  Writes 64-bit vector register values into main register.
; Params:   rdi = loops
; Notes:    256 byte chunk / 8 bytes per transfer = 32 transfers: the loop
;           count is doubled up front and each pass has 16 extracts.
;------------------------------------------------------------------------------
Vector64ToRegister:
_Vector64ToRegister:
        add     rdi, rdi                ; Two passes per requested loop.
.again:
        pextrq  rax, xmm1, 0
        pextrq  rax, xmm2, 1
        pextrq  rax, xmm3, 0
        pextrq  rax, xmm1, 1
        pextrq  rax, xmm2, 0
        pextrq  rax, xmm3, 1
        pextrq  rax, xmm0, 0
        pextrq  rax, xmm0, 1

        pextrq  rax, xmm0, 0
        pextrq  rax, xmm0, 1
        pextrq  rax, xmm0, 1
        pextrq  rax, xmm0, 1
        pextrq  rax, xmm0, 0
        pextrq  rax, xmm0, 0
        pextrq  rax, xmm0, 0
        pextrq  rax, xmm0, 0

        dec     rdi
        jnz     .again
        ret
;------------------------------------------------------------------------------
; Name:     CopySSE
; Purpose:  Copies memory chunks that are 16-byte aligned.
; Params:   rdi = ptr to destination memory area
;           rsi = ptr to source memory area
;           rdx = length in bytes
;           rcx = loops
; Notes:    The length is rounded down to a multiple of 256. If the rounded
;           length or the loop count is zero, nothing is copied. Loads use
;           movdqa and stores use movntdq, so both areas must be 16-byte
;           aligned. xmm4..xmm15 are saved on entry and restored on exit.
;------------------------------------------------------------------------------
CopySSE:
_CopySSE:
        push    r10

        shr     rdx, 8                  ; Ensure length is multiple of 256.
        shl     rdx, 8
        jz      .exit                   ; Under 256 bytes: the countdown below
                                        ; would wrap, so copy nothing.
        test    rcx, rcx
        jz      .exit                   ; Zero loops: likewise copy nothing.

        ; Save our non-parameter XMM registers (12 registers * 16 bytes).
        sub     rsp, 192
        movdqu  [rsp], xmm4
        movdqu  [16+rsp], xmm5
        movdqu  [32+rsp], xmm6
        movdqu  [48+rsp], xmm7
        movdqu  [64+rsp], xmm8
        movdqu  [80+rsp], xmm9
        movdqu  [96+rsp], xmm10
        movdqu  [112+rsp], xmm11
        movdqu  [128+rsp], xmm12
        movdqu  [144+rsp], xmm13
        movdqu  [160+rsp], xmm14
        movdqu  [176+rsp], xmm15

.L1:
        mov     r10, rdx                ; r10 = bytes left in this pass.
.L2:
        ; Load one 256-byte chunk into xmm0..xmm15 ...
        movdqa  xmm0, [rsi]
        movdqa  xmm1, [16+rsi]
        movdqa  xmm2, [32+rsi]
        movdqa  xmm3, [48+rsi]
        movdqa  xmm4, [64+rsi]
        movdqa  xmm5, [80+rsi]
        movdqa  xmm6, [96+rsi]
        movdqa  xmm7, [112+rsi]
        movdqa  xmm8, [128+rsi]
        movdqa  xmm9, [144+rsi]
        movdqa  xmm10, [160+rsi]
        movdqa  xmm11, [176+rsi]
        movdqa  xmm12, [192+rsi]
        movdqa  xmm13, [208+rsi]
        movdqa  xmm14, [224+rsi]
        movdqa  xmm15, [240+rsi]

        ; ... and write it out with non-temporal stores.
        movntdq [rdi], xmm0
        movntdq [16+rdi], xmm1
        movntdq [32+rdi], xmm2
        movntdq [48+rdi], xmm3
        movntdq [64+rdi], xmm4
        movntdq [80+rdi], xmm5
        movntdq [96+rdi], xmm6
        movntdq [112+rdi], xmm7
        movntdq [128+rdi], xmm8
        movntdq [144+rdi], xmm9
        movntdq [160+rdi], xmm10
        movntdq [176+rdi], xmm11
        movntdq [192+rdi], xmm12
        movntdq [208+rdi], xmm13
        movntdq [224+rdi], xmm14
        movntdq [240+rdi], xmm15

        add     rsi, 256
        add     rdi, 256
        sub     r10, 256
        jnz     .L2

        sub     rsi, rdx                ; rsi now points to start.
        sub     rdi, rdx                ; rdi now points to start.
        dec     rcx
        jnz     .L1

        ; Restore exactly what was saved, from the same slots. The earlier
        ; epilogue reloaded xmm0..xmm15 from [rsp]..[240+rsp], which put the
        ; wrong values in xmm4..xmm15 and read past the 192-byte save area.
        movdqu  xmm4, [rsp]
        movdqu  xmm5, [16+rsp]
        movdqu  xmm6, [32+rsp]
        movdqu  xmm7, [48+rsp]
        movdqu  xmm8, [64+rsp]
        movdqu  xmm9, [80+rsp]
        movdqu  xmm10, [96+rsp]
        movdqu  xmm11, [112+rsp]
        movdqu  xmm12, [128+rsp]
        movdqu  xmm13, [144+rsp]
        movdqu  xmm14, [160+rsp]
        movdqu  xmm15, [176+rsp]
        add     rsp, 192
.exit:
        pop     r10
        ret