| # This Source Code Form is subject to the terms of the Mozilla Public |
| # License, v. 2.0. If a copy of the MPL was not distributed with this |
| # file, You can obtain one at http://mozilla.org/MPL/2.0/. |
| |
| # ** ARCFOUR implementation optimized for AMD64. |
| # ** |
| # ** The throughput achieved by this code is about 320 MBytes/sec, on |
| # ** a 1.8 GHz AMD Opteron (rev C0) processor. |
| |
# ---------------------------------------------------------------------
# ARCFOUR -- XOR `len` bytes from `in` with the RC4 keystream and write
# the result to `out`, updating the cipher state held in `key`.
# GNU assembler, AT&T operand order (source first, destination last).
#
# Arguments, as the code reads them (this is the order of the first four
# integer argument registers in the System V AMD64 calling convention):
#   %rdi  key  state block: x at offset 0, y at offset 8, table d[] of
#              8-byte slots starting at offset 16
#   %rsi  len  number of bytes to process
#   %rdx  in   source bytes
#   %rcx  out  destination bytes
#
# Register roles inside the body:
#   %rbp  d (key + 16)
#   %rcx  x, kept one round ahead of the round being computed
#   %rdx  y
#   %rax  tx = d[x], preloaded for the coming round
#   %rbx  scratch: in+len-8 at first, then ty / index of the output byte
#   %rsi, %rdi  in / out cursors
#   %r9   in+len-8 during the 8-byte loop, in+len during the tail loop
#   %r8   keystream bytes, %r11 byte counter of the 8-byte loop
#
# %rbx and %rbp are pushed on entry and popped before ret. %rax, %rcx,
# %rdx, %rsi, %rdi, %r8, %r9, %r11 and the flags are overwritten. No
# dedicated return value is set up; %rax simply holds tx at ret.
#
# Structure: one loop that produces and applies 8 keystream bytes per
# pass while a full 8-byte block still fits, followed by a loop that
# handles the remaining bytes one at a time.
#
# NOTE(review): y is loaded as a full qword and never masked, and table
# slots are read with 32-bit loads and used as indices after an 8-bit
# add. The code therefore relies on the upper bytes of key->y and of
# every d[] slot being zero -- confirm against the code that sets up key.
# NOTE(review): the pointer comparisons use signed conditions (jl/jle).
# ---------------------------------------------------------------------
| .text |
| .align 16 |
| .globl ARCFOUR |
| .type ARCFOUR,@function |
| ARCFOUR: |
| pushq %rbp |
| pushq %rbx |
| movq %rdi, %rbp # key = ARG(key), 1st argument |
| movq %rsi, %rbx # rbx = ARG(len), 2nd argument |
| movq %rdx, %rsi # in = ARG(in), 3rd argument |
| movq %rcx, %rdi # out = ARG(out), 4th argument |
| movq (%rbp), %rcx # x = key->x (qword at offset 0) |
| movq 8(%rbp), %rdx # y = key->y (qword at offset 8, used unmasked) |
| addq $16, %rbp # d = key->data (table starts at offset 16) |
| incq %rcx # x++ (x is kept one step ahead of the round in progress) |
| andq $255, %rcx # x &= 0xff |
| leaq -8(%rbx,%rsi), %rbx # rbx = in+len-8, the last address at which a full 8-byte block can start |
| movq %rbx, %r9 # tmp = in+len-8 (kept in r9 because rbx is reused as scratch below) |
| movq 0(%rbp,%rcx,8), %rax # tx = d[x], preloaded for the first round |
| cmpq %rsi, %rbx # cmp in with in+len-8 |
| jl .Lend # jump if (in+len-8 < in), i.e. len < 8: skip the 8-byte loop (signed compare) |
| |
| .Lstart: |
| addq $8, %rsi # increment in (the current block is addressed as -8(%rsi) below) |
| addq $8, %rdi # increment out (the current block is addressed as -8(%rdi) below) |
| |
| # generate the next 8 bytes of the rc4 stream into %r8; after eight 8-bit rotates the first byte is back in bits 0-7 |
| movq $8, %r11 # byte counter |
| 1: addb %al, %dl # y += tx (8-bit add, wraps mod 256) |
| movl 0(%rbp,%rdx,8), %ebx # ty = d[y] |
| movl %ebx, 0(%rbp,%rcx,8) # d[x] = ty |
| addb %al, %bl # val = ty + tx (8-bit add; rbx becomes the index of the output byte) |
| movl %eax, 0(%rbp,%rdx,8) # d[y] = tx |
| incb %cl # x++ (NEXT ROUND), 8-bit so it wraps mod 256 |
| movl 0(%rbp,%rcx,8), %eax # tx = d[x] (NEXT ROUND) |
| movb 0(%rbp,%rbx,8), %r8b # val = d[val], keystream byte placed in the low byte of r8 |
| decb %r11b |
| rorq $8, %r8 # (ror does not change ZF), so the jnz below still tests the decb result |
| jnz 1b |
| |
| # xor 8 bytes of input with the keystream and store them to out |
| xorq -8(%rsi), %r8 |
| cmpq %r9, %rsi # cmp in+len-8 with in (flags are consumed by jle; the movq in between leaves them intact) |
| movq %r8, -8(%rdi) |
| jle .Lstart # jump if (in <= in+len-8): another full 8-byte block fits |
| |
| .Lend: |
| addq $8, %r9 # tmp = in+len (end of input) |
| |
| # handle the last bytes (fewer than 8 remain here), one by one |
| 1: cmpq %rsi, %r9 # cmp in with in+len |
| jle .Lfinished # jump if (in+len <= in): nothing left to process |
| addb %al, %dl # y += tx (8-bit add, wraps mod 256) |
| movl 0(%rbp,%rdx,8), %ebx # ty = d[y] |
| movl %ebx, 0(%rbp,%rcx,8) # d[x] = ty |
| addb %al, %bl # val = ty + tx (8-bit add; rbx becomes the index of the output byte) |
| movl %eax, 0(%rbp,%rdx,8) # d[y] = tx |
| incb %cl # x++ (NEXT ROUND), 8-bit so it wraps mod 256 |
| movl 0(%rbp,%rcx,8), %eax # tx = d[x] (NEXT ROUND) |
| movb 0(%rbp,%rbx,8), %r8b # val = d[val], the keystream byte |
| xorb (%rsi), %r8b # xor 1 byte of input with the keystream byte |
| movb %r8b, (%rdi) |
| incq %rsi # in++ |
| incq %rdi # out++ |
| jmp 1b |
| |
| .Lfinished: |
| decq %rcx # x-- (undo the look-ahead increment; only cl is stored below) |
| movb %dl, -8(%rbp) # key->y = y (rbp is key+16, so -8 is offset 8; low byte only) |
| movb %cl, -16(%rbp) # key->x = x (rbp is key+16, so -16 is offset 0; low byte only) |
| popq %rbx |
| popq %rbp |
| ret |
| .L_ARCFOUR_end: |
| .size ARCFOUR,.L_ARCFOUR_end-ARCFOUR |
| |
| # Magic indicating no need for an executable stack: an empty .note.GNU-stack section is emitted, then .previous switches back to the section that was active before it |
| .section .note.GNU-stack,"",@progbits |
| .previous |