blob: 1749ee4fe88f185622bc7e393c740ba1f57ca19b [file] [log] [blame]
/* compress.c - deflate/inflate code for zip, gzip, zlib, and raw
*
* Copyright 2014 Rob Landley <rob@landley.net>
*
* The inflate/deflate code lives here, so the various things that use it
* either live here or call these commands to pipe data through them.
*
* Divergence from posix: replace obsolete/patented "compress" with mutiplexer.
* (gzip already replaces "uncompress".)
*
* See RFCs 1950 (zlib), 1951 (deflate), and 1952 (gzip)
* LSB 4.1 has gzip, gunzip, and zcat
* TODO: zip -d DIR -x LIST -list -quiet -no overwrite -overwrite -p to stdout
// Accept many different kinds of command line argument.
// Leave Lrg at end so flag values line up.
USE_COMPRESS(NEWTOY(compress, "zcd9lrg[-cd][!zgLr]", TOYFLAG_USR|TOYFLAG_BIN))
USE_GZIP(NEWTOY(gzip, USE_GZIP_D("d")"19dcflqStvgLRz[!gLRz]", TOYFLAG_USR|TOYFLAG_BIN))
USE_ZCAT(NEWTOY(zcat, 0, TOYFLAG_USR|TOYFLAG_BIN))
USE_GUNZIP(NEWTOY(gunzip, "cflqStv", TOYFLAG_USR|TOYFLAG_BIN))
//zip unzip gzip gunzip zcat
config COMPRESS
bool "compress"
default n
help
usage: compress [-zgLR19] [FILE]
Compress or decompress file (or stdin) using "deflate" algorithm.
-1 min compression
-9 max compression (default)
-g gzip (default)
-L zlib
-R raw
-z zip
config GZIP
bool "gzip"
default y
depends on COMPRESS
help
usage: gzip [-19cfqStvzgLR] [FILE...]
Compess (deflate) file(s). With no files, compress stdin to stdout.
On successful decompression, compressed files are replaced with the
uncompressed version. The input file is removed and replaced with
a new file without the .gz extension (with same ownership/permissions).
-1 Minimal compression (fastest)
-9 Max compression (default)
-c cat to stdout (act as zcat)
-f force (if output file exists, input is tty, unrecognized extension)
-q quiet (no warnings)
-S specify exension (default .*)
-t test compressed file(s)
-v verbose (like -l, but compress files)
Compression type:
-g gzip (default) -L zlib -R raw -z zip
config GZIP_D
bool
default y
depends on GZIP && DECOMPRESS
help
usage: gzip [-d]
-d decompress (act as gunzip)
config DECOMPRESS
bool "decompress"
default n
help
usage: compress [-zglrcd9] [FILE]
Compress or decompress file (or stdin) using "deflate" algorithm.
-c compress with -g gzip (default) -l zlib -r raw -z zip
-d decompress (autodetects type)
config ZCAT
bool "zcat"
default y
depends on DECOMPRESS
help
usage: zcat [FILE...]
Decompress deflated file(s) to stdout
config GUNZIP
bool "gunzip"
default y
depends on DECOMPRESS
help
usage: gunzip [-cflqStv] [FILE...]
Decompess (deflate) file(s). With no files, compress stdin to stdout.
On successful decompression, compressed files are replaced with the
uncompressed version. The input file is removed and replaced with
a new file without the .gz extension (with same ownership/permissions).
-c cat to stdout (act as zcat)
-f force (output file exists, input is tty, unrecognized extension)
-l list compressed/uncompressed/ratio/name for each input file.
-q quiet (no warnings)
-S specify exension (default .*)
-t test compressed file(s)
-v verbose (like -l, but decompress files)
*/
#define FOR_compress
#include "toys.h"
GLOBALS(
// Huffman codes: base offset and extra bits tables (length and distance)
char lenbits[29], distbits[30];
unsigned short lenbase[29], distbase[30];
void *fixdisthuff, *fixlithuff;
// CRC
void (*crcfunc)(char *data, int len);
unsigned crc;
// Compressed data buffer
char *data;
unsigned pos, len;
int infd, outfd;
// Tables only used for deflation
unsigned short *hashhead, *hashchain;
)
// little endian bit buffer
struct bitbuf {
int fd, bitpos, len, max;
char buf[];
};
// malloc a struct bitbuf
struct bitbuf *bitbuf_init(int fd, int size)
{
struct bitbuf *bb = xzalloc(sizeof(struct bitbuf)+size);
bb->max = size;
bb->fd = fd;
return bb;
}
// Advance bitpos without the overhead of recording bits
void bitbuf_skip(struct bitbuf *bb, int bits)
{
int pos = bb->bitpos + bits, len = bb->len << 3;
while (pos >= len) {
pos -= len;
len = (bb->len = read(bb->fd, bb->buf, bb->max)) << 3;
if (bb->len < 1) perror_exit("inflate EOF");
}
bb->bitpos = pos;
}
// Optimized single bit inlined version
static inline int bitbuf_bit(struct bitbuf *bb)
{
int bufpos = bb->bitpos>>3;
if (bufpos == bb->len) {
bitbuf_skip(bb, 0);
bufpos = 0;
}
return (bb->buf[bufpos]>>(bb->bitpos++&7))&1;
}
// Fetch the next X bits from the bitbuf, little endian
unsigned bitbuf_get(struct bitbuf *bb, int bits)
{
int result = 0, offset = 0;
while (bits) {
int click = bb->bitpos >> 3, blow, blen;
// Load more data if buffer empty
if (click == bb->len) bitbuf_skip(bb, click = 0);
// grab bits from next byte
blow = bb->bitpos & 7;
blen = 8-blow;
if (blen > bits) blen = bits;
result |= ((bb->buf[click] >> blow) & ((1<<blen)-1)) << offset;
offset += blen;
bits -= blen;
bb->bitpos += blen;
}
return result;
}
void bitbuf_flush(struct bitbuf *bb)
{
if (!bb->bitpos) return;
xwrite(bb->fd, bb->buf, (bb->bitpos+7)/8);
memset(bb->buf, 0, bb->max);
bb->bitpos = 0;
}
void bitbuf_put(struct bitbuf *bb, int data, int len)
{
while (len) {
int click = bb->bitpos >> 3, blow, blen;
// Flush buffer if necessary
if (click == bb->max) {
bitbuf_flush(bb);
click = 0;
}
blow = bb->bitpos & 7;
blen = 8-blow;
if (blen > len) blen = len;
bb->buf[click] |= data << blow;
bb->bitpos += blen;
data >>= blen;
len -= blen;
}
}
static void output_byte(char sym)
{
int pos = TT.pos++ & 32767;
TT.data[pos] = sym;
if (!pos) {
xwrite(TT.outfd, TT.data, 32768);
if (TT.crcfunc) TT.crcfunc(TT.data, 32768);
}
}
// Huffman coding uses bits to traverse a binary tree to a leaf node,
// By placing frequently occurring symbols at shorter paths, frequently
// used symbols may be represented in fewer bits than uncommon symbols.
struct huff {
unsigned short length[16];
unsigned short symbol[288];
};
// Create simple huffman tree from array of bit lengths.
// The symbols in the huffman trees are sorted (first by bit length
// of the code to reach them, then by symbol number). This means that given
// the bit length of each symbol, we can construct a unique tree.
static void len2huff(struct huff *huff, char bitlen[], int len)
{
int offset[16];
int i;
// Count number of codes at each bit length
memset(huff, 0, sizeof(struct huff));
for (i = 0; i<len; i++) huff->length[bitlen[i]]++;
// Sort symbols by bit length. (They'll remain sorted by symbol within that.)
*huff->length = *offset = 0;
for (i = 1; i<16; i++) offset[i] = offset[i-1] + huff->length[i-1];
for (i = 0; i<len; i++) if (bitlen[i]) huff->symbol[offset[bitlen[i]]++] = i;
}
// Fetch and decode next huffman coded symbol from bitbuf.
// This takes advantage of the sorting to navigate the tree as an array:
// each time we fetch a bit we have all the codes at that bit level in
// order with no gaps.
static unsigned huff_and_puff(struct bitbuf *bb, struct huff *huff)
{
unsigned short *length = huff->length;
int start = 0, offset = 0;
// Traverse through the bit lengths until our code is in this range
for (;;) {
offset = (offset << 1) | bitbuf_bit(bb);
start += *++length;
if ((offset -= *length) < 0) break;
if ((length - huff->length) & 16) error_exit("bad symbol");
}
return huff->symbol[start + offset];
}
// Decompress deflated data from bitbuf to TT.outfd.
static void inflate(struct bitbuf *bb)
{
TT.crc = ~0;
// repeat until spanked
for (;;) {
int final, type;
final = bitbuf_get(bb, 1);
type = bitbuf_get(bb, 2);
if (type == 3) error_exit("bad type");
// Uncompressed block?
if (!type) {
int len, nlen;
// Align to byte, read length
bitbuf_skip(bb, (8-bb->bitpos)&7);
len = bitbuf_get(bb, 16);
nlen = bitbuf_get(bb, 16);
if (len != (0xffff & ~nlen)) error_exit("bad len");
// Dump literal output data
while (len) {
int pos = bb->bitpos >> 3, bblen = bb->len - pos;
char *p = bb->buf+pos;
// dump bytes until done or end of current bitbuf contents
if (bblen > len) bblen = len;
pos = bblen;
while (pos--) output_byte(*(p++));
bitbuf_skip(bb, bblen << 3);
len -= bblen;
}
// Compressed block
} else {
struct huff *disthuff, *lithuff;
// Dynamic huffman codes?
if (type == 2) {
struct huff *h2 = ((struct huff *)toybuf)+1;
int i, litlen, distlen, hufflen;
char *hufflen_order = "\x10\x11\x12\0\x08\x07\x09\x06\x0a\x05\x0b"
"\x04\x0c\x03\x0d\x02\x0e\x01\x0f", *bits;
// The huffman trees are stored as a series of bit lengths
litlen = bitbuf_get(bb, 5)+257; // max 288
distlen = bitbuf_get(bb, 5)+1; // max 32
hufflen = bitbuf_get(bb, 4)+4; // max 19
// The literal and distance codes are themselves compressed, in
// a complicated way: an array of bit lengths (hufflen many
// entries, each 3 bits) is used to fill out an array of 19 entries
// in a magic order, leaving the rest 0. Then make a tree out of it:
memset(bits = toybuf+1, 0, 19);
for (i=0; i<hufflen; i++) bits[hufflen_order[i]] = bitbuf_get(bb, 3);
len2huff(h2, bits, 19);
// Use that tree to read in the literal and distance bit lengths
for (i = 0; i < litlen + distlen;) {
int sym = huff_and_puff(bb, h2);
// 0-15 are literals, 16 = repeat previous code 3-6 times,
// 17 = 3-10 zeroes (3 bit), 18 = 11-138 zeroes (7 bit)
if (sym < 16) bits[i++] = sym;
else {
int len = sym & 2;
len = bitbuf_get(bb, sym-14+len+(len>>1)) + 3 + (len<<2);
memset(bits+i, bits[i-1] * !(sym&3), len);
i += len;
}
}
if (i > litlen+distlen) error_exit("bad tree");
len2huff(lithuff = h2, bits, litlen);
len2huff(disthuff = ((struct huff *)toybuf)+2, bits+litlen, distlen);
// Static huffman codes
} else {
lithuff = TT.fixlithuff;
disthuff = TT.fixdisthuff;
}
// Use huffman tables to decode block of compressed symbols
for (;;) {
int sym = huff_and_puff(bb, lithuff);
// Literal?
if (sym < 256) output_byte(sym);
// Copy range?
else if (sym > 256) {
int len, dist;
sym -= 257;
len = TT.lenbase[sym] + bitbuf_get(bb, TT.lenbits[sym]);
sym = huff_and_puff(bb, disthuff);
dist = TT.distbase[sym] + bitbuf_get(bb, TT.distbits[sym]);
sym = TT.pos & 32767;
while (len--) output_byte(TT.data[(TT.pos-dist) & 32767]);
// End of block
} else break;
}
}
// Was that the last block?
if (final) break;
}
if (TT.pos & 32767) {
xwrite(TT.outfd, TT.data, TT.pos & 32767);
if (TT.crcfunc) TT.crcfunc(TT.data, TT.pos & 32767);
}
}
// Deflate from TT.infd to bitbuf
// For deflate, TT.len = input read, TT.pos = input consumed
static void deflate(struct bitbuf *bb)
{
char *data = TT.data;
int len, final = 0;
TT.crc = ~0;
while (!final) {
// Read next half-window of data if we haven't hit EOF yet.
len = readall(TT.infd, data+(TT.len&32768), 32768);
if (len < 0) perror_exit("read"); // todo: add filename
if (len != 32768) final++;
if (TT.crcfunc) TT.crcfunc(data+(TT.len&32768), len);
// TT.len += len; crcfunc advances len
// store block as literal
bitbuf_put(bb, final, 1);
bitbuf_put(bb, 0, 1);
bitbuf_put(bb, 0, (8-bb->bitpos)&7);
bitbuf_put(bb, len, 16);
bitbuf_put(bb, 0xffff & ~len, 16);
// repeat until spanked
while (TT.pos != TT.len) {
unsigned pos = TT.pos & 65535;
bitbuf_put(bb, data[pos], 8);
// need to refill buffer?
if (!(32767 & ++TT.pos) && !final) break;
}
}
bitbuf_flush(bb);
}
// Allocate memory for deflate/inflate.
static void init_deflate(int compress)
{
int i, n = 1;
// compress needs 64k data and 32k each for hashhead and hashchain.
// decompress just needs 32k data.
TT.data = xmalloc(32768*(compress ? 4 : 1));
if (compress) {
TT.hashhead = (unsigned short *)(TT.data + 65536);
TT.hashchain = (unsigned short *)(TT.data + 65536 + 32768);
}
// Calculate lenbits, lenbase, distbits, distbase
*TT.lenbase = 3;
for (i = 0; i<sizeof(TT.lenbits)-1; i++) {
if (i>4) {
if (!(i&3)) {
TT.lenbits[i]++;
n <<= 1;
}
if (i == 27) n--;
else TT.lenbits[i+1] = TT.lenbits[i];
}
TT.lenbase[i+1] = n + TT.lenbase[i];
}
n = 0;
for (i = 0; i<sizeof(TT.distbits); i++) {
TT.distbase[i] = 1<<n;
if (i) TT.distbase[i] += TT.distbase[i-1];
if (i>3 && !(i&1)) n++;
TT.distbits[i] = n;
}
// Init fixed huffman tables
for (i=0; i<288; i++) toybuf[i] = 8 + (i>143) - ((i>255)<<1) + (i>279);
len2huff(TT.fixlithuff = ((struct huff *)toybuf)+3, toybuf, 288);
memset(toybuf, 5, 30);
len2huff(TT.fixdisthuff = ((struct huff *)toybuf)+4, toybuf, 30);
}
// Return true/false whether we consumed a gzip header.
static int is_gzip(struct bitbuf *bb)
{
int flags;
// Confirm signature
if (bitbuf_get(bb, 24) != 0x088b1f || (flags = bitbuf_get(bb, 8)) > 31)
return 0;
bitbuf_skip(bb, 6*8);
// Skip extra, name, comment, header CRC fields
if (flags & 4) bitbuf_skip(bb, 16);
if (flags & 8) while (bitbuf_get(bb, 8));
if (flags & 16) while (bitbuf_get(bb, 8));
if (flags & 2) bitbuf_skip(bb, 16);
return 1;
}
void gzip_crc(char *data, int len)
{
int i;
unsigned crc, *crc_table = (unsigned *)(toybuf+sizeof(toybuf)-1024);
crc = TT.crc;
for (i=0; i<len; i++) crc = crc_table[(crc^data[i])&0xff] ^ (crc>>8);
TT.crc = crc;
TT.len += len;
}
static void do_gzip(int fd, char *name)
{
struct bitbuf *bb = bitbuf_init(1, sizeof(toybuf));
// Header from RFC 1952 section 2.2:
// 2 ID bytes (1F, 8b), gzip method byte (8=deflate), FLAG byte (none),
// 4 byte MTIME (zeroed), Extra Flags (2=maximum compression),
// Operating System (FF=unknown)
TT.infd = fd;
xwrite(bb->fd, "\x1f\x8b\x08\0\0\0\0\0\x02\xff", 10);
// Use last 1k of toybuf for little endian crc table
crc_init((unsigned *)(toybuf+sizeof(toybuf)-1024), 1);
TT.crcfunc = gzip_crc;
deflate(bb);
// tail: crc32, len32
bitbuf_put(bb, 0, (8-bb->bitpos)&7);
bitbuf_put(bb, ~TT.crc, 32);
bitbuf_put(bb, TT.len, 32);
bitbuf_flush(bb);
free(bb);
}
static void do_zcat(int fd, char *name)
{
struct bitbuf *bb = bitbuf_init(fd, sizeof(toybuf));
if (!is_gzip(bb)) error_exit("not gzip");
TT.outfd = 1;
// Use last 1k of toybuf for little endian crc table
crc_init((unsigned *)(toybuf+sizeof(toybuf)-1024), 1);
TT.crcfunc = gzip_crc;
inflate(bb);
// tail: crc32, len32
bitbuf_skip(bb, (8-bb->bitpos)&7);
if (~TT.crc != bitbuf_get(bb, 32) || TT.len != bitbuf_get(bb, 32))
error_exit("bad crc");
free(bb);
}
// Parse many different kinds of command line argument:
void compress_main(void)
{
// todo: this
printf("hello world");
}
//#define CLEANUP_compress
//#define FOR_zcat
//#include "generated/flags.h"
void zcat_main(void)
{
init_deflate(0);
loopfiles(toys.optargs, do_zcat);
}
void gunzip_main(void)
{
init_deflate(0);
loopfiles(toys.optargs, do_zcat);
}
void gzip_main(void)
{
init_deflate(1);
loopfiles(toys.optargs, do_gzip);
}