| ; vim:filetype=nasm ts=8 |
| |
| ; libFLAC - Free Lossless Audio Codec library |
| ; Copyright (C) 2001,2002,2003,2004,2005,2006,2007 Josh Coalson |
| ; |
| ; Redistribution and use in source and binary forms, with or without |
| ; modification, are permitted provided that the following conditions |
| ; are met: |
| ; |
| ; - Redistributions of source code must retain the above copyright |
| ; notice, this list of conditions and the following disclaimer. |
| ; |
| ; - Redistributions in binary form must reproduce the above copyright |
| ; notice, this list of conditions and the following disclaimer in the |
| ; documentation and/or other materials provided with the distribution. |
| ; |
| ; - Neither the name of the Xiph.org Foundation nor the names of its |
| ; contributors may be used to endorse or promote products derived from |
| ; this software without specific prior written permission. |
| ; |
| ; THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS |
| ; ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT |
| ; LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR |
| ; A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR |
| ; CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, |
| ; EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, |
| ; PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR |
| ; PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF |
| ; LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING |
| ; NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS |
| ; SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. |
| |
| %include "nasm.h" |
| |
| data_section |
| |
| cextern FLAC__crc16_table ; unsigned FLAC__crc16_table[256]; (CRC-16 lookup table; the code below reads it as 4-byte entries via [index*4 + table]) |
| cextern bitreader_read_from_client_ ; FLAC__bool bitreader_read_from_client_(FLAC__BitReader *br); (called below when more input is needed; a zero return makes this routine return false) |
| |
| cglobal FLAC__bitreader_read_rice_signed_block_asm_ia32_bswap |
| |
| code_section |
| |
| |
| ; ********************************************************************** |
| ; C prototype (all four arguments are read from the stack, the result is returned in eax): |
| ; FLAC__bool FLAC__bitreader_read_rice_signed_block_asm_ia32_bswap(FLAC__BitReader *br, int vals[], unsigned nvals, unsigned parameter) |
| ; Decodes nvals values into vals[]: each one is a unary-coded part, a stop bit, then 'parameter' binary bits, folded to a signed int. Returns 1 on success, 0 if bitreader_read_from_client_() fails. |
| ; Some details like assertions and other checking are performed by the caller. |
| ALIGN 16 |
| cident FLAC__bitreader_read_rice_signed_block_asm_ia32_bswap |
| |
| ;ASSERT(0 != br); |
| ;ASSERT(0 != br->buffer); |
| ; WATCHOUT: code only works if a brword is 32 bits wide; we can make things much faster with this assertion |
| ;ASSERT(FLAC__BITS_PER_WORD == 32); |
| ;ASSERT(parameter < 32); |
| ; the above two asserts also guarantee that the binary part never straddles more than 2 words, so we don't have to loop to read it |
| |
| ;; peppered throughout the code at major checkpoints are keys like this as to where things are at that point in time |
| ;; [esp + 16] unsigned parameter |
| ;; [esp + 12] unsigned nvals |
| ;; [esp + 8] int vals[] |
| ;; [esp + 4] FLAC__BitReader *br |
| mov eax, [esp + 12] ; if(nvals == 0) |
| test eax, eax |
| ja .nvals_gt_0 |
| mov eax, 1 ; return true; /* nothing to decode; only eax has been touched */ |
| ret |
| |
| .nvals_gt_0: |
| push ebp |
| push ebx |
| push esi |
| push edi |
| sub esp, 4 |
| ;; [esp + 36] unsigned parameter (arguments now sit 20 bytes higher: 4 pushed registers + one 4-byte local) |
| ;; [esp + 32] unsigned nvals |
| ;; [esp + 28] int vals[] |
| ;; [esp + 24] FLAC__BitReader *br |
| ;; [esp] ucbits (local: number of buffered bits not yet consumed, see the formula below) |
| mov ebp, [esp + 24] ; ebp <- br (br->buffer is at offset 0, so [ebp] reads br->buffer) |
| mov esi, [ebp + 16] ; esi <- br->consumed_words (aka 'cwords' in the C version) |
| mov ecx, [ebp + 20] ; ecx <- br->consumed_bits (aka 'cbits' in the C version) |
| xor edi, edi ; edi <- 0 'uval' (accumulator for the value currently being decoded) |
| ;; ecx cbits |
| ;; esi cwords |
| ;; edi uval |
| ;; ebp br |
| ;; [ebp] br->buffer |
| ;; [ebp + 8] br->words |
| ;; [ebp + 12] br->bytes |
| ;; [ebp + 16] br->consumed_words |
| ;; [ebp + 20] br->consumed_bits |
| ;; [ebp + 24] br->read_crc |
| ;; [ebp + 28] br->crc16_align |
| |
| ; ucbits = (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8 - cbits; |
| mov eax, [ebp + 8] ; eax <- br->words |
| sub eax, esi ; eax <- br->words-cwords |
| shl eax, 2 ; eax <- (br->words-cwords)*FLAC__BYTES_PER_WORD |
| add eax, [ebp + 12] ; eax <- (br->words-cwords)*FLAC__BYTES_PER_WORD + br->bytes |
| shl eax, 3 ; eax <- (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8 |
| sub eax, ecx ; eax <- (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8 - cbits |
| mov [esp], eax ; ucbits <- eax |
| |
| ALIGN 16 |
| .val_loop: ; while(1) { /* one pass per decoded value; uval (edi) is 0 each time this label is reached */ |
| |
| ; |
| ; read unary part: count the zero bits before the next 1 (stop) bit, accumulating the count in uval |
| ; |
| .unary_loop: ; while(1) { /* also re-entered from .read1 after a successful read */ |
| ;; ecx cbits |
| ;; esi cwords |
| ;; edi uval |
| ;; ebp br |
| cmp esi, [ebp + 8] ; while(cwords < br->words) /* if we've not consumed up to a partial tail word... */ |
| jae near .c1_next1 |
| .c1_loop: ; { |
| mov ebx, [ebp] |
| mov eax, [ebx + 4*esi] ; b = br->buffer[cwords] |
| mov edx, eax ; edx = br->buffer[cwords] (unshifted copy, kept for CRC'ing once the word is used up) |
| shl eax, cl ; b = br->buffer[cwords] << cbits |
| test eax, eax ; (still have to test since cbits may be 0, thus ZF not updated for shl eax,0) |
| jz near .c1_next2 ; if(b) { |
| bsr ebx, eax |
| not ebx |
| and ebx, 31 ; ebx = 'i' = 31 - (index of highest set bit) = # of leading 0 bits in 'b' (eax) |
| add ecx, ebx ; cbits += i; |
| add edi, ebx ; uval += i; |
| add ecx, byte 1 ; cbits++; /* skip over stop bit */ |
| test ecx, ~31 |
| jz near .break1 ; [taken when cbits < 32: word not used up, unary part is done] otherwise: if(cbits >= FLAC__BITS_PER_WORD) { /* faster way of testing if(cbits == FLAC__BITS_PER_WORD) */ |
| ; crc16_update_word_(br, br->buffer[cwords]); |
| push edi ; [need more registers: uval is saved because edi is reused as the table pointer] |
| bswap edx ; edx = br->buffer[cwords] swapped; now we can CRC the bytes from LSByte to MSByte which makes things much easier |
| mov ecx, [ebp + 28] ; ecx <- br->crc16_align |
| mov eax, [ebp + 24] ; ax <- br->read_crc (a.k.a. crc) |
| %ifdef FLAC__PUBLIC_NEEDS_UNDERSCORE |
| mov edi, _FLAC__crc16_table |
| %else |
| mov edi, FLAC__crc16_table |
| %endif |
| ;; eax (ax) crc a.k.a. br->read_crc |
| ;; ebx (bl) intermediate result index into FLAC__crc16_table[] |
| ;; ecx br->crc16_align |
| ;; edx byteswapped brword to CRC |
| ;; esi cwords |
| ;; edi unsigned FLAC__crc16_table[] |
| ;; ebp br |
| test ecx, ecx ; switch(br->crc16_align) ... |
| jnz .c0b4 ; [br->crc16_align is 0 the vast majority of the time so we optimize the common case] |
| .c0b0: xor dl, ah ; dl <- (crc>>8)^(word>>24) |
| movzx ebx, dl |
| mov ecx, [ebx*4 + edi] ; cx <- FLAC__crc16_table[(crc>>8)^(word>>24)] |
| shl eax, 8 ; ax <- (crc<<8) |
| xor eax, ecx ; crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^(word>>24)] |
| .c0b1: xor dh, ah ; dh <- (crc>>8)^((word>>16)&0xff)) |
| movzx ebx, dh |
| mov ecx, [ebx*4 + edi] ; cx <- FLAC__crc16_table[(crc>>8)^((word>>16)&0xff))] |
| shl eax, 8 ; ax <- (crc<<8) |
| xor eax, ecx ; crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^((word>>16)&0xff))] |
| shr edx, 16 |
| .c0b2: xor dl, ah ; dl <- (crc>>8)^((word>>8)&0xff)) |
| movzx ebx, dl |
| mov ecx, [ebx*4 + edi] ; cx <- FLAC__crc16_table[(crc>>8)^((word>>8)&0xff))] |
| shl eax, 8 ; ax <- (crc<<8) |
| xor eax, ecx ; crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^((word>>8)&0xff))] |
| .c0b3: xor dh, ah ; dh <- (crc>>8)^(word&0xff) |
| movzx ebx, dh |
| mov ecx, [ebx*4 + edi] ; cx <- FLAC__crc16_table[(crc>>8)^(word&0xff)] |
| shl eax, 8 ; ax <- (crc<<8) |
| xor eax, ecx ; crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^(word&0xff)] |
| movzx eax, ax |
| mov [ebp + 24], eax ; br->read_crc <- crc (only the low 16 bits are kept) |
| pop edi |
| |
| add esi, byte 1 ; cwords++; |
| xor ecx, ecx ; cbits = 0; |
| ; } |
| jmp near .break1 ; goto break1; |
| ;; this section relocated out of the way for performance: reached when br->crc16_align != 0; clears it and enters the CRC chain above at the byte it selects (8 -> .c0b1, 16 -> .c0b2, otherwise .c0b3) |
| .c0b4: |
| mov [ebp + 28], dword 0 ; br->crc16_align <- 0 |
| cmp ecx, 8 |
| je .c0b1 |
| shr edx, 16 |
| cmp ecx, 16 |
| je .c0b2 |
| jmp .c0b3 |
| |
| ;; this section relocated out of the way for performance: same alignment handling as .c0b4, but for the CRC chain inside .c1_next2 below (8 -> .c1b1, 16 -> .c1b2, otherwise .c1b3) |
| .c1b4: |
| mov [ebp + 28], dword 0 ; br->crc16_align <- 0 |
| cmp ecx, 8 |
| je .c1b1 |
| shr edx, 16 |
| cmp ecx, 16 |
| je .c1b2 |
| jmp .c1b3 |
| |
| .c1_next2: ; } else { /* b == 0: no stop bit in the unread rest of this word */ |
| ;; ecx cbits |
| ;; edx current brword 'b' |
| ;; esi cwords |
| ;; edi uval |
| ;; ebp br |
| add edi, 32 |
| sub edi, ecx ; uval += FLAC__BITS_PER_WORD - cbits; |
| ; crc16_update_word_(br, br->buffer[cwords]); |
| push edi ; [need more registers: uval is saved because edi is reused as the table pointer] |
| bswap edx ; edx = br->buffer[cwords] swapped; now we can CRC the bytes from LSByte to MSByte which makes things much easier |
| mov ecx, [ebp + 28] ; ecx <- br->crc16_align |
| mov eax, [ebp + 24] ; ax <- br->read_crc (a.k.a. crc) |
| %ifdef FLAC__PUBLIC_NEEDS_UNDERSCORE |
| mov edi, _FLAC__crc16_table |
| %else |
| mov edi, FLAC__crc16_table |
| %endif |
| ;; eax (ax) crc a.k.a. br->read_crc |
| ;; ebx (bl) intermediate result index into FLAC__crc16_table[] |
| ;; ecx br->crc16_align |
| ;; edx byteswapped brword to CRC |
| ;; esi cwords |
| ;; edi unsigned FLAC__crc16_table[] |
| ;; ebp br |
| test ecx, ecx ; switch(br->crc16_align) ... |
| jnz .c1b4 ; [br->crc16_align is 0 the vast majority of the time so we optimize the common case] |
| .c1b0: xor dl, ah ; dl <- (crc>>8)^(word>>24) |
| movzx ebx, dl |
| mov ecx, [ebx*4 + edi] ; cx <- FLAC__crc16_table[(crc>>8)^(word>>24)] |
| shl eax, 8 ; ax <- (crc<<8) |
| xor eax, ecx ; crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^(word>>24)] |
| .c1b1: xor dh, ah ; dh <- (crc>>8)^((word>>16)&0xff)) |
| movzx ebx, dh |
| mov ecx, [ebx*4 + edi] ; cx <- FLAC__crc16_table[(crc>>8)^((word>>16)&0xff))] |
| shl eax, 8 ; ax <- (crc<<8) |
| xor eax, ecx ; crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^((word>>16)&0xff))] |
| shr edx, 16 |
| .c1b2: xor dl, ah ; dl <- (crc>>8)^((word>>8)&0xff)) |
| movzx ebx, dl |
| mov ecx, [ebx*4 + edi] ; cx <- FLAC__crc16_table[(crc>>8)^((word>>8)&0xff))] |
| shl eax, 8 ; ax <- (crc<<8) |
| xor eax, ecx ; crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^((word>>8)&0xff))] |
| .c1b3: xor dh, ah ; dh <- (crc>>8)^(word&0xff) |
| movzx ebx, dh |
| mov ecx, [ebx*4 + edi] ; cx <- FLAC__crc16_table[(crc>>8)^(word&0xff)] |
| shl eax, 8 ; ax <- (crc<<8) |
| xor eax, ecx ; crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^(word&0xff)] |
| movzx eax, ax |
| mov [ebp + 24], eax ; br->read_crc <- crc (only the low 16 bits are kept) |
| pop edi |
| |
| add esi, byte 1 ; cwords++; |
| xor ecx, ecx ; cbits = 0; |
| ; /* didn't find stop bit yet, have to keep going... */ |
| ; } |
| |
| cmp esi, [ebp + 8] ; } while(cwords < br->words) /* if we've not consumed up to a partial tail word... */ |
| jb near .c1_loop |
| |
| .c1_next1: |
| ; at this point we've eaten up all the whole words; have to try |
| ; reading through any tail bytes before calling the read callback. |
| ; this is a repeat of the above logic adjusted for the fact we |
| ; don't have a whole word. note though if the client is feeding |
| ; us data a byte at a time (unlikely), br->consumed_bits may not |
| ; be zero. |
| ; |
| ; the tail word holds 'end' = br->bytes*8 valid bits in its most |
| ; significant positions; the bits below them are masked off so they |
| ; can never be mistaken for a stop bit. |
| ;; ecx cbits |
| ;; esi cwords |
| ;; edi uval |
| ;; ebp br |
| mov edx, [ebp + 12] ; edx <- br->bytes |
| test edx, edx |
| jz .read1 ; if(br->bytes) { [NOTE: this case is rare so it doesn't have to be all that fast ] |
| mov ebx, [ebp] |
| shl edx, 3 ; edx <- const unsigned end = br->bytes * 8; |
| mov eax, [ebx + 4*esi] ; b = br->buffer[cwords] |
| xchg edx, ecx ; [edx <- cbits , ecx <- end] (shift counts must be in cl) |
| mov ebx, 0xffffffff ; ebx <- FLAC__WORD_ALL_ONES |
| shr ebx, cl ; ebx <- FLAC__WORD_ALL_ONES >> end |
| not ebx ; ebx <- ~(FLAC__WORD_ALL_ONES >> end) |
| xchg edx, ecx ; [edx <- end , ecx <- cbits] |
| and eax, ebx ; b = (br->buffer[cwords] & ~(FLAC__WORD_ALL_ONES >> end)); |
| shl eax, cl ; b = (br->buffer[cwords] & ~(FLAC__WORD_ALL_ONES >> end)) << cbits; |
| test eax, eax ; (still have to test since cbits may be 0, thus ZF not updated for shl eax,0) |
| jz .c1_next3 ; if(b) { |
| bsr ebx, eax |
| not ebx |
| and ebx, 31 ; ebx = 'i' = 31 - (index of highest set bit) = # of leading 0 bits in 'b' (eax) |
| add ecx, ebx ; cbits += i; |
| add edi, ebx ; uval += i; |
| add ecx, byte 1 ; cbits++; /* skip over stop bit */ |
| jmp short .break1 ; goto break1; |
| .c1_next3: ; } else { |
| sub edi, ecx |
| add edi, edx ; uval += end - cbits; |
| ; all 'end' valid bits of the tail word are consumed now, so cbits |
| ; must become exactly 'end'. (adding 'end' to cbits is only right |
| ; when cbits was 0; otherwise consumed_bits overshoots and the |
| ; ucbits recomputed in .read1 comes out too small.) |
| mov ecx, edx ; cbits = end; |
| ; /* didn't find stop bit yet, have to keep going... */ |
| ; } |
| ; } |
| .read1: |
| ; flush registers and read; bitreader_read_from_client_() does |
| ; not touch br->consumed_bits at all but we still need to set |
| ; it in case it fails and we have to return false. |
| ;; ecx cbits |
| ;; esi cwords |
| ;; edi uval |
| ;; ebp br |
| mov [ebp + 16], esi ; br->consumed_words = cwords; |
| mov [ebp + 20], ecx ; br->consumed_bits = cbits; |
| push ecx ; /* save cbits across the call */ |
| push ebp ; /* push br argument */ |
| %ifdef FLAC__PUBLIC_NEEDS_UNDERSCORE |
| call _bitreader_read_from_client_ |
| %else |
| call bitreader_read_from_client_ |
| %endif |
| pop edx ; /* pop the br argument off the stack; the value is discarded, unused */ |
| pop ecx ; /* restore cbits */ |
| mov esi, [ebp + 16] ; cwords = br->consumed_words; /* reloaded from br after the call */ |
| ; ucbits = (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8 - cbits; |
| mov ebx, [ebp + 8] ; ebx <- br->words |
| sub ebx, esi ; ebx <- br->words-cwords |
| shl ebx, 2 ; ebx <- (br->words-cwords)*FLAC__BYTES_PER_WORD |
| add ebx, [ebp + 12] ; ebx <- (br->words-cwords)*FLAC__BYTES_PER_WORD + br->bytes |
| shl ebx, 3 ; ebx <- (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8 |
| sub ebx, ecx ; ebx <- (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8 - cbits |
| add ebx, edi ; ebx <- (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8 - cbits + uval |
| ; + uval to offset our count by the # of unary bits already |
| ; consumed before the read, because we will subtract the whole |
| ; of uval from ucbits all at once at break1 |
| mov [esp], ebx ; ucbits <- ebx |
| test eax, eax ; if(!bitreader_read_from_client_(br)) /* eax still holds the call's return value */ |
| jnz near .unary_loop |
| jmp .end ; return false; /* eax (the return value) is already 0; .end only releases the local and restores the saved registers */ |
| ; } /* end while(1) unary part */ |
| |
| ALIGN 16 |
| .break1: |
| ;; ecx cbits |
| ;; esi cwords |
| ;; edi uval |
| ;; ebp br |
| ;; [esp] ucbits |
| sub [esp], edi ; ucbits -= uval; |
| sub dword [esp], byte 1 ; ucbits--; /* account for stop bit */ |
| |
| ; |
| ; read binary part: the next 'parameter' bits of the stream are shifted into the low end of uval |
| ; |
| mov ebx, [esp + 36] ; ebx <- parameter |
| test ebx, ebx ; if(parameter) { /* parameter == 0 means there is no binary part to read */ |
| jz near .break2 |
| .read2: |
| cmp [esp], ebx ; while(ucbits < parameter) { /* not enough buffered bits for the binary part yet */ |
| jae .c2_next1 |
| ; flush registers and read; bitreader_read_from_client_() does |
| ; not touch br->consumed_bits at all but we still need to set |
| ; it in case it fails and we have to return false. |
| mov [ebp + 16], esi ; br->consumed_words = cwords; |
| mov [ebp + 20], ecx ; br->consumed_bits = cbits; |
| push ecx ; /* save cbits across the call */ |
| push ebp ; /* push br argument */ |
| %ifdef FLAC__PUBLIC_NEEDS_UNDERSCORE |
| call _bitreader_read_from_client_ |
| %else |
| call bitreader_read_from_client_ |
| %endif |
| pop edx ; /* pop the br argument off the stack; the value is discarded, unused */ |
| pop ecx ; /* restore cbits */ |
| mov esi, [ebp + 16] ; cwords = br->consumed_words; /* reloaded from br after the call */ |
| ; ucbits = (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8 - cbits; |
| mov edx, [ebp + 8] ; edx <- br->words |
| sub edx, esi ; edx <- br->words-cwords |
| shl edx, 2 ; edx <- (br->words-cwords)*FLAC__BYTES_PER_WORD |
| add edx, [ebp + 12] ; edx <- (br->words-cwords)*FLAC__BYTES_PER_WORD + br->bytes |
| shl edx, 3 ; edx <- (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8 |
| sub edx, ecx ; edx <- (br->words-cwords)*FLAC__BITS_PER_WORD + br->bytes*8 - cbits |
| mov [esp], edx ; ucbits <- edx |
| test eax, eax ; if(!bitreader_read_from_client_(br)) /* eax still holds the call's return value */ |
| jnz .read2 |
| jmp .end ; return false; /* eax (the return value) is already 0 */ |
| ; } |
| ; } |
| .c2_next1: |
| ;; ebx parameter |
| ;; ecx cbits |
| ;; esi cwords |
| ;; edi uval |
| ;; ebp br |
| ;; [esp] ucbits |
| cmp esi, [ebp + 8] ; if(cwords < br->words) { /* if we've not consumed up to a partial tail word... */ |
| jae near .c2_next2 |
| test ecx, ecx ; if(cbits) { |
| jz near .c2_next3 ; /* this also works when consumed_bits==0, it's just a little slower than necessary for that case */ |
| mov eax, 32 |
| mov edx, [ebp] |
| sub eax, ecx ; const unsigned n = FLAC__BITS_PER_WORD - cbits; /* bits still unread in the current word */ |
| mov edx, [edx + 4*esi] ; const brword word = br->buffer[cwords]; |
| cmp ebx, eax ; if(parameter < n) { /* the binary part fits in this word with bits to spare */ |
| jae .c2_next4 |
| ; uval <<= parameter; |
| ; uval |= (word & (FLAC__WORD_ALL_ONES >> cbits)) >> (n-parameter); [done below as: edx <- word << cbits, then shld moves the top 'parameter' bits of edx into uval] |
| shl edx, cl |
| xchg ebx, ecx |
| shld edi, edx, cl |
| add ebx, ecx ; cbits += parameter; /* ebx and ecx are swapped at this point */ |
| xchg ebx, ecx ; ebx <- parameter, ecx <- cbits |
| jmp .break2 ; goto break2; |
| ; } |
| .c2_next4: |
| ; uval <<= n; /* parameter >= n: take all n remaining bits of this word */ |
| ; uval |= word & (FLAC__WORD_ALL_ONES >> cbits); |
| %if 1 |
| rol edx, cl ; @@@@@@OPT: may be faster to use rol to save edx so we can restore it for CRC'ing |
| ; @@@@@@OPT: or put parameter in ch instead and free up ebx completely again |
| %else |
| shl edx, cl |
| %endif |
| xchg eax, ecx |
| shld edi, edx, cl |
| xchg eax, ecx |
| %if 1 |
| ror edx, cl ; restored: undoes the rol above so edx is the original word again for CRC'ing |
| %else |
| mov edx, [ebp] |
| mov edx, [edx + 4*esi] |
| %endif |
| ; crc16_update_word_(br, br->buffer[cwords]); |
| push edi ; [need more registers: saves uval] |
| push ebx ; [need more registers: saves parameter] |
| push eax ; [need more registers: saves n] |
| bswap edx ; edx = br->buffer[cwords] swapped; now we can CRC the bytes from LSByte to MSByte which makes things much easier |
| mov ecx, [ebp + 28] ; ecx <- br->crc16_align |
| mov eax, [ebp + 24] ; ax <- br->read_crc (a.k.a. crc) |
| %ifdef FLAC__PUBLIC_NEEDS_UNDERSCORE |
| mov edi, _FLAC__crc16_table |
| %else |
| mov edi, FLAC__crc16_table |
| %endif |
| ;; eax (ax) crc a.k.a. br->read_crc |
| ;; ebx (bl) intermediate result index into FLAC__crc16_table[] |
| ;; ecx br->crc16_align |
| ;; edx byteswapped brword to CRC |
| ;; esi cwords |
| ;; edi unsigned FLAC__crc16_table[] |
| ;; ebp br |
| test ecx, ecx ; switch(br->crc16_align) ... |
| jnz .c2b4 ; [br->crc16_align is 0 the vast majority of the time so we optimize the common case] |
| .c2b0: xor dl, ah ; dl <- (crc>>8)^(word>>24) |
| movzx ebx, dl |
| mov ecx, [ebx*4 + edi] ; cx <- FLAC__crc16_table[(crc>>8)^(word>>24)] |
| shl eax, 8 ; ax <- (crc<<8) |
| xor eax, ecx ; crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^(word>>24)] |
| .c2b1: xor dh, ah ; dh <- (crc>>8)^((word>>16)&0xff)) |
| movzx ebx, dh |
| mov ecx, [ebx*4 + edi] ; cx <- FLAC__crc16_table[(crc>>8)^((word>>16)&0xff))] |
| shl eax, 8 ; ax <- (crc<<8) |
| xor eax, ecx ; crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^((word>>16)&0xff))] |
| shr edx, 16 |
| .c2b2: xor dl, ah ; dl <- (crc>>8)^((word>>8)&0xff)) |
| movzx ebx, dl |
| mov ecx, [ebx*4 + edi] ; cx <- FLAC__crc16_table[(crc>>8)^((word>>8)&0xff))] |
| shl eax, 8 ; ax <- (crc<<8) |
| xor eax, ecx ; crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^((word>>8)&0xff))] |
| .c2b3: xor dh, ah ; dh <- (crc>>8)^(word&0xff) |
| movzx ebx, dh |
| mov ecx, [ebx*4 + edi] ; cx <- FLAC__crc16_table[(crc>>8)^(word&0xff)] |
| shl eax, 8 ; ax <- (crc<<8) |
| xor eax, ecx ; crc <- ax <- (crc<<8) ^ FLAC__crc16_table[(crc>>8)^(word&0xff)] |
| movzx eax, ax |
| mov [ebp + 24], eax ; br->read_crc <- crc (only the low 16 bits are kept) |
| pop eax |
| pop ebx |
| pop edi |
| add esi, byte 1 ; cwords++; |
| mov ecx, ebx |
| sub ecx, eax ; cbits = parameter - n; /* bits still to be taken from the next word */ |
| jz .break2 ; if(cbits) { /* parameter > n, i.e. if there are still bits left to read, there have to be less than 32 so they will all be in the next word */ |
| ; uval <<= cbits; |
| ; uval |= (br->buffer[cwords] >> (FLAC__BITS_PER_WORD-cbits)); [both steps are done by the single shld below] |
| mov eax, [ebp] |
| mov eax, [eax + 4*esi] |
| shld edi, eax, cl |
| ; } |
| jmp .break2 ; goto break2; |
| |
| ;; this section relocated out of the way for performance: reached when br->crc16_align != 0; clears it and enters the CRC chain above at the byte it selects (8 -> .c2b1, 16 -> .c2b2, otherwise .c2b3) |
| .c2b4: |
| mov [ebp + 28], dword 0 ; br->crc16_align <- 0 |
| cmp ecx, 8 |
| je .c2b1 |
| shr edx, 16 |
| cmp ecx, 16 |
| je .c2b2 |
| jmp .c2b3 |
| |
| .c2_next3: ; } else { /* cbits == 0: the whole binary part comes from the top of the current word */ |
| mov ecx, ebx ; cbits = parameter; |
| ; uval <<= cbits; |
| ; uval |= (br->buffer[cwords] >> (FLAC__BITS_PER_WORD-cbits)); [both steps are done by the single shld below] |
| mov eax, [ebp] |
| mov eax, [eax + 4*esi] |
| shld edi, eax, cl |
| jmp .break2 ; goto break2; |
| ; } |
| .c2_next2: ; } else { |
| ; in this case we're starting our read at a partial tail word; |
| ; the reader has guaranteed that we have at least 'parameter' |
| ; bits available to read, which makes this case simpler. |
| ; uval <<= parameter; |
| ; if(cbits) { |
| ; /* this also works when consumed_bits==0, it's just a little slower than necessary for that case */ |
| ; uval |= (br->buffer[cwords] & (FLAC__WORD_ALL_ONES >> cbits)) >> (FLAC__BITS_PER_WORD-cbits-parameter); |
| ; cbits += parameter; |
| ; goto break2; |
| ; } else { |
| ; cbits = parameter; |
| ; uval |= br->buffer[cwords] >> (FLAC__BITS_PER_WORD-cbits); |
| ; goto break2; |
| ; } |
| ; the above is much shorter in assembly: |
| mov eax, [ebp] |
| mov eax, [eax + 4*esi] ; eax <- br->buffer[cwords] |
| shl eax, cl ; eax <- br->buffer[cwords] << cbits |
| add ecx, ebx ; cbits += parameter |
| xchg ebx, ecx ; ebx <- cbits (already advanced), ecx <- parameter |
| shld edi, eax, cl ; uval <- (uval << parameter) | (top 'parameter' bits of eax, i.e. of the tail word) |
| xchg ebx, ecx ; ebx <- parameter, ecx <- cbits |
| ; } |
| ; } |
| ; } |
| .break2: |
| sub [esp], ebx ; ucbits -= parameter; /* ebx holds parameter on every path that reaches this label */ |
| |
| ; |
| ; compose the value: uval is folded back to a signed int (even uval -> uval >> 1, odd uval -> ~(uval >> 1)) |
| ; |
| mov ebx, [esp + 28] ; ebx <- vals |
| mov edx, edi ; edx <- uval |
| and edi, 1 ; edi <- uval & 1 |
| shr edx, 1 ; edx <- uval >> 1 |
| neg edi ; edi <- -(int)(uval & 1) /* 0 or all ones */ |
| xor edx, edi ; edx <- (uval >> 1 ^ -(int)(uval & 1)) |
| mov [ebx], edx ; *vals <- edx |
| sub dword [esp + 32], byte 1 ; --nvals; /* counts down in the argument's own stack slot */ |
| jz .finished ; if(nvals == 0) /* jump to finish: store cwords/cbits back into br and return true */ |
| xor edi, edi ; uval = 0; |
| add dword [esp + 28], 4 ; ++vals /* advances the pointer in the argument's own stack slot by one int */ |
| jmp .val_loop ; } |
| |
| .finished: |
| mov [ebp + 16], esi ; br->consumed_words = cwords; |
| mov [ebp + 20], ecx ; br->consumed_bits = cbits; |
| mov eax, 1 |
| .end: |
| add esp, 4 |
| pop edi |
| pop esi |
| pop ebx |
| pop ebp |
| ret |
| |
| end |
| |
| %ifdef OBJ_FORMAT_elf |
| section .note.GNU-stack noalloc |
| %endif |