
/*
 * Dated January 27th, 1998
 *
 * My implementation of the Burrows-Wheeler Transform,
 * coded from scrap, using ideas from various sources, including:
 *   Mike Burrows + David Wheeler                    (the BWT itself)
 *   Michael Schindler      (shifting whole bytes in the arith coder)
 *   Peter Fenwick                             (the structured model)
 *
 * Copyright (C) 1998 by Willem Monsuwe
 *    Faculteit Informatica, Technische Universiteit Eindhoven
 *    email: willem@stack.nl
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
 *
 * The GNU General Public License is contained in the file LICENSE.
 *
 * DISCLAIMER:
 *    I TAKE NO RESPONSIBILITY FOR ANY LOSS OF DATA ARISING FROM THE
 *    USE OF THIS PROGRAM, HOWSOEVER CAUSED.
 *
 * Programmer's notes:
 * - Compile with: gcc -O2 -fomit-frame-pointer -o bwc bwc.c
 *   (-m386 on an intel.  Makes it smaller, and a tad bit faster)
 * - I optimized this code for speed on linux-i486 compiling with GCC 2.7.2
 *   This should explain the many strange constructions.
 *   However, this might make it less optimal for other compilers
 *   or processors.  I'm interested to hear about this.
 * - I seem to get better speed if I compile with -m386 (on a 486)
 *   which seems odd.  I would like to hear more about this too.
 *   (I certainly get smaller code)
 * - There are some spots where the compiler produces sub-optimal code.
 *   (The ones I found are labeled with 'ASM')
 *   However, putting asm() statements there seems to screw up register
 *   allocation.  If you want that extra 1% of speed I suggest editing
 *   the assembler output.
 */

#include <stdio.h>
#include <stdlib.h>
#include <assert.h>
#include <signal.h>
#include <errno.h>
#include <string.h>

/* Configure stuff */

/* Default extension for bwc compressed files */
#define BWC_EXT ".bc"

/* Default block size, counted in units of 64k (the usage text prints
 * DEF_BLOCK_SIZE * 64 as kilobytes).
 * Compression uses BLOCK_SIZE/2 + 0.25 megabytes */
#define DEF_BLOCK_SIZE 14	/* Default uses 7.25 megs */

/* It really helps speed if your compiler can inline functions.
 * GCC gets its own inline keyword; for every other compiler the
 * word 'inline' is defined away to nothing. */
#ifdef __GNUC__
#define inline __inline__
#else
#define inline   /**/
#endif   

/* Small scalar types used throughout this file */
typedef unsigned char	byte;	/* 8-bit unsigned integer */
typedef short		bool;	/* truth value: anything that holds 0 and 1 */

/* Generated file for u32, u16, u32_8 (byte order stuff) */
#include "u32_16.h"
#include "u32_8.h"

#if defined(__unix__) || defined(unix)

#include <utime.h>
#include <unistd.h>
#include <sys/stat.h>
#include <sys/times.h>

#define PATH_SC '/'

#elif defined(__amiga__) || defined(amiga)
/* I don't know if these includes are OK for every amiga.. */
#include <utime.h>
#include <unistd.h>
#include <sys/stat.h>
#include <sys/times.h>

#define PATH_SC '/'
#else  /* unix */
/* Add your machine/OS here */
#warning There is no machine-dependent section for your machine (yet)
#warning Trying to compile with some defaults..

#include <utime.h>
#include <unistd.h>
#include <sys/stat.h>
#include <sys/times.h>

#define PATH_SC '/'

#endif /* unix */

/* Configurable stuff ends here */

/* Block sizes are counted in units of (1 << BLOCK_BITS) bytes, i.e. 64k */
#define BLOCK_BITS 16

/* p32_8 is the element type of the sort array (ptrB).
 *   ref(p)     turns a p32_8 into a u32_8 pointer
 *   ref_u32(p) turns a p32_8 into a u32 pointer
 *   deref(p)   turns a u32_8 pointer back into a p32_8
 * Every macro argument is parenthesised so that an expression can be
 * passed safely. */
#ifdef HAVE_32_BIT_POINTERS
/* Pointers are stored directly */
typedef u32_8 *p32_8;
#define ref(p) (p)
#define ref_u32(p) ((u32 *)(p))
#define deref(p) (p)
#else
/* If we have 64-bit pointers, use array indices into sB instead (slower) */
typedef u32 p32_8;
#define ref(p) (sB + (p))
#define ref_u32(p) ((u32 *)sB + (p))
#define deref(p) ((p) - sB)
#endif

/* big memory block for decompression (sized by decompress_realloc) */
static u32_8 *vecB = NULL;	/* detransform vector */

/* big memory blocks for compression.
 * compress_alloc carves all of them out of a single allocation. */
static byte *inB;	/* The input goes here (note: same mem as ptrB) */
static u32_8 *sB;	/* The striped data goes here */
static p32_8 *ptrB;	/* The pointers go here */
static u32 *rdxB;	/* 256k for radix stuff */

/* Strip the top byte of a u32_8 and return the remaining 24 bits */
static inline u32
low24(u32_8 x)
{
	const u32 mask = 0x00ffffff;

	return x.l & mask;
}

/* Commandline flags (one bit each) */
static struct {
	unsigned compressing	:1;	/* Are we compressing ? (-d is the decompress option) */
	unsigned keep_input	:1;	/* -k: Keep input files */
	unsigned to_stdout	:1;	/* -c: Output to stdout */
	unsigned verbose	:1;	/* -v: Output statistics */
	unsigned unbwz		:1;	/* Program called as 'unbwz' */
	unsigned test		:1;	/* -t: Test BWC integrity */
	unsigned error		:1;	/* An error occurred */
} Flg;

/* Prototypes of the error reporters, which are defined further down
 * but are already needed by the coder and the (de)compressor. */
static void input_error(void);
static void output_error(void);
static void block_error(void);
static void magic_error(void);
static void cmdline_error(void);
static void terminal_error(char *);

/* Program name and the file names currently worked on (used in messages) */
static char *myName, *inFileName, *outFileName;

/* First, the arith coder.  */

/* State of one arithmetic (de)coding stream.
 * The decoder uses bt, sz and io only; td and b are the encoder's
 * carry bookkeeping. */
typedef struct {
	u32_8 bt;	/* bottom of range */
	u32_8 sz;	/* size of range */
	u32 td;		/* todo (0xff bytes) */
	FILE *io;	/* input or output */
	byte b;		/* byte to go out */
} aStream;

/* Starting size of range */
#define A_START_SIZE 0x01000000L

/* Prime the arith decoder (decompress):
 * reset the range size and preload three bytes of the stream
 * into the bottom value, highest byte first. */
static inline void
start_decoder(FILE *in, aStream *as)
{
	int hi, mid, lo;

	as->io = in;
	as->sz.l = A_START_SIZE;

	/* The order of these reads is the order of the bytes in the file */
	hi = getc(in);
	mid = getc(in);
	lo = getc(in);

	as->bt.b.b4 = 0;
	as->bt.b.b3 = hi;
	as->bt.b.b2 = mid;
	as->bt.b.b1 = lo;
}

/* get a symbol from the arith decoder (decompress)
 *   fq: table of symbol frequencies
 *   ft: frequency total, used to divide the current range
 *   as: decoder state; sz and bt are narrowed to the decoded symbol
 * Returns the index of the decoded symbol.
 * Symbol 0 owns the bottom of the range and also gets the remainder
 * (sz % ft) that the integer division leaves over.
 * NOTE(review): the scan over fq[] has no bound of its own, it relies
 * on ft being consistent with the table - confirm for corrupt input. */
static inline u32
get_symbol(u32 *fq, u32 ft, aStream *as)
{
	u32 s;
	u32 dv = as->sz.l / ft;		/* range width of one frequency unit */
	u32 df = as->sz.l % ft;		/* remainder, given to symbol 0 */

	if (as->bt.l < df) {
		/* Inside the remainder: symbol 0 */
		as->sz.l = fq[0] * dv + df;
		s = 0;
	} else {
		u32 t, h;
		/* t is the cumulative frequency that bt points at */
		t = (as->bt.l - df) / dv;
		s = 0;
		h = fq[0];
		if (h > t) {
			/* Still symbol 0 */
			as->sz.l = fq[0] * dv + df;
		} else {
			/* Accumulate frequencies until t is passed */
			do h += fq[++s]; while (h <= t);
			h -= fq[s];	/* h = cumulative frequency below s */
			as->sz.l = fq[s] * dv;
			as->bt.l -= (h * dv + df);
		}
	}
	/* Renormalise: shift in new bytes while the range is too small */
	while (!as->sz.b.b3) {
		as->sz.l <<= 8;
		as->bt.l <<= 8;
		as->bt.b.b1 = getc(as->io);
	}
	return s;
}

/* Get a non-modeled byte from the arith decoder (decompress)
 * This is get_symbol for 0x100 symbols that all have frequency 1,
 * so the symbol follows directly from a division.
 * Returns the decoded value truncated to a byte. */
static inline byte
get_byte(aStream *as)
{
	u32 s;
	u32 dv = as->sz.l / 0x100;	/* range width of one byte value */
	u32 df = as->sz.l % 0x100;	/* remainder, given to value 0 */

	if (as->bt.l < df) {
		/* Inside the remainder: value 0 */
		as->sz.l = dv + df;
		s = 0;
	} else {
		s = (as->bt.l - df) / dv;
		if (s < 1) {
			as->sz.l = dv + df;
		} else {
			as->sz.l = dv;
			as->bt.l -= (s * dv + df);
		}
	}
	/* Renormalise: a single test is used here, not a loop */
	if (!as->sz.b.b3) {
		as->sz.l <<= 8;
		as->bt.l <<= 8;
		as->bt.b.b1 = getc(as->io);
	}
	return (byte)s;
}

/* Read four non-modeled bytes from the arith decoder (decompress)
 * and assemble them into a u32; the first byte read lands in b1. */
static inline u32
get_u32(aStream *as)
{
	u32_8 word;
	byte first, second, third, fourth;

	first = get_byte(as);
	second = get_byte(as);
	third = get_byte(as);
	fourth = get_byte(as);

	word.b.b1 = first;
	word.b.b2 = second;
	word.b.b3 = third;
	word.b.b4 = fourth;
	return word.l;
}

/* Set up the arith coder (compress)
 * The coder always holds back one byte before writing it, since a
 * later carry may still change it.  The caller passes the byte that
 * should occupy that slot at the start (b); it is written as soon as
 * the coder shifts out its first byte.
 */
static inline void
start_coder(FILE *out, byte b, aStream *as)
{
	/* Output side */
	as->io = out;
	as->b = b;
	as->td = 0;

	/* Full range, starting at the bottom */
	as->sz.l = A_START_SIZE;
	as->bt.l = 0;
}

/* A carry came out of the range bottom.
 * Without pending 0xff bytes only the held-back byte is bumped.
 * With pending bytes the bumped byte is written, every pending 0xff
 * turns into 0x00, and the last of those becomes the new held-back
 * byte. */
static inline void
shift_carry(aStream *as)
{
	u32 pending = as->td;

	if (pending == 0) {
		++as->b;
		return;
	}
	putc((byte)(as->b + 1), as->io);
	/* One zero fewer is written: the last one stays in as->b */
	for (pending--; pending != 0; pending--)
		putc(0x00, as->io);
	as->b = 0;
	as->td = 0;
}

/* Shift out 'known' bytes
 * Runs while the range size has no bits in its third byte.
 * The top byte of the bottom value (b3) is either counted as a
 * pending 0xff (a later carry could still ripple through it), or it
 * becomes the new held-back byte after the old one and all pending
 * 0xff bytes have been written. */
static inline void
shift_byte(aStream *as)
{
	while (!as->sz.b.b3) {
		if (as->bt.b.b3 == 0xff) {  /* Would propagate carry */
			++(as->td);
		} else {
			u32 td;
			/* No carry can reach the held-back byte any more */
			putc(as->b, as->io);
			td = as->td;
			if (td > 0) {
				do putc(0xff, as->io); while (--td);
				as->td = 0;
			}
			as->b = as->bt.b.b3;
		}
		as->sz.l <<= 8;
		as->bt.l <<= 8;
		/* The byte just handled moved into b4: drop it */
		as->bt.b.b4 = 0;
	}
}

/* Put symbol #s through the arith coder (compress)
 *   s:  index of the symbol
 *   fq: table of symbol frequencies
 *   ft: frequency total, used to divide the current range
 *   as: coder state
 * Mirror image of get_symbol: symbol 0 keeps the bottom of the range
 * plus the remainder of the division, every other symbol moves the
 * bottom up by the remainder and by the frequencies below it. */
static inline void
send_symbol(u32 s, u32 *fq, u32 ft, aStream *as)
{
	if (s) {
		u32 dv = as->sz.l / ft;

		as->bt.l += (as->sz.l % ft);
		as->sz.l = fq[s] * dv;
		{
			register u32 sbt = fq[0];

			/* ASM: compiler could not put sbt in register */
			/* Sum fq[0] .. fq[s-1] */
			while (--s)
				sbt += fq[s];
			as->bt.l += sbt * dv;
		}
		/* Anything in b4 is a carry out of the 24-bit bottom */
		if (as->bt.b.b4) {
			shift_carry(as);
			as->bt.b.b4 = 0;
		}
	} else {
		as->sz.l = (fq[0] * (as->sz.l / ft)) + (as->sz.l % ft);
	}
	shift_byte(as);
}

/* Put non-modeled byte s through the arith coder (compress)
 * Works like send_symbol with 0x100 symbols of frequency 1 each.
 * Value 0 keeps the bottom of the range plus the division remainder. */
static inline void
send_byte(u32 s, aStream *as)
{
	if (s) {
		as->bt.l += (as->sz.l % 0x100);
		as->sz.l = as->sz.l / 0x100;
		as->bt.l += s * as->sz.l;
		/* Anything in b4 is a carry out of the 24-bit bottom */
		if (as->bt.b.b4) {
			shift_carry(as);
			as->bt.b.b4 = 0;
		}
	} else {
		as->sz.l = ((as->sz.l / 0x100)) + (as->sz.l % 0x100);
	}
	shift_byte(as);
}

/* Send a u32 through the coder as four non-modeled bytes (compress)
 * Byte b1 goes first, b4 last, matching get_u32. */
static inline void
send_u32(u32 s, aStream *as)
{
	u32_8 word;

	word.l = s;
	send_byte(word.b.b1, as);
	send_byte(word.b.b2, as);
	send_byte(word.b.b3, as);
	send_byte(word.b.b4, as);
}

/* Flush the coder (compress): write the held-back byte, all pending
 * 0xff bytes and the three bytes of the range bottom.
 * Returns 1 (after reporting) if the stream has an error, else 0. */
static inline bool
end_coder(aStream *as)
{
	u32 pending;

	putc(as->b, as->io);
	for (pending = as->td; pending > 0; pending--)
		putc(0xff, as->io);

	putc(as->bt.b.b3, as->io);
	putc(as->bt.b.b2, as->io);
	putc(as->bt.b.b1, as->io);

	if (!ferror(as->io))
		return 0;
	output_error();
	return 1;
}

/* The structured model */

/* Normal values are from 1-255 (s+1) */
#define AV_EOB 0x100	/* EndOfBlock */

/* Largest submodel: 80_FF (128 values) with EOB tacked on the end */
#define MAX_SYMBOLS 129

/* One adaptive frequency table */
typedef struct {
	u32 ft;		/* frequency total */
	u32 in;		/* increase by this */
	u32 fq[MAX_SYMBOLS];	/* frequencies */
} Model;

/* The base model plus one submodel per value range */
static Model aModels[8];

/* Hack: two values (model pointer, number of symbols).
 * Use only in function call to
 * init_model, mget_symbol or model_symbol */
#define AM_BASE  (&aModels[0]),10
#define AM_02_03 (&aModels[1]),2
#define AM_04_07 (&aModels[2]),4
#define AM_08_0F (&aModels[3]),8
#define AM_10_1F (&aModels[4]),16
#define AM_20_3F (&aModels[5]),32
#define AM_40_7F (&aModels[6]),64
#define AM_80_FF (&aModels[7]),129

/* Symbols of the base model: value one, the two run-length digits,
 * and one escape per submodel */
#define AVB_ONE   0
#define AVB_R1    1
#define AVB_R2    2
#define AVB_02_03 3
#define AVB_04_07 4
#define AVB_08_0F 5
#define AVB_10_1F 6
#define AVB_20_3F 7
#define AVB_40_7F 8
#define AVB_80_FF 9

/* Reset model m to ns symbols that all have frequency in.
 * in is also stored as the step by which frequencies grow later.
 * ns must be at least 1. */
static inline void
init_model(Model *m, u32 ns, u32 in)
{
	u32 sym;

	m->in = in;
	m->ft = ns * in;
	for (sym = 0; sym < ns; sym++)
		m->fq[sym] = in;
}

/* A model is rescaled as soon as this bit shows up in its total */
#define SCALE_BIT 12

/* Bring all eight models back to their starting state.
 * The increment gets smaller as the submodel gets larger. */
static void
init_models(void)
{
	/* Submodels, largest first */
	init_model(AM_80_FF, 2);
	init_model(AM_40_7F, 3);
	init_model(AM_20_3F, 4);
	init_model(AM_10_1F, 5);
	init_model(AM_08_0F, 6);
	init_model(AM_04_07, 7);
	init_model(AM_02_03, 8);

	/* The base model that selects between them */
	init_model(AM_BASE, 25);
}

/* Update model m with symbol s (compress/decompress)
 *   s:  the symbol that was just coded
 *   m:  the model it was coded with
 *   ns: number of symbols in m
 * Adds the model's increment to the symbol and to the total.  Once the
 * total has the SCALE_BIT set, every frequency is halved (and bumped
 * by one so it never reaches zero) and the total is recomputed. */
static inline void
update_model(u32 s, Model *m, u32 ns)
{
	m->ft += m->in;
	m->fq[s] += m->in;
	if (m->ft & (1 << SCALE_BIT)) {
		/* Walk the table from the last symbol down to the first */
		u32 j = 0, *i = &(m->fq[ns - 1]);
		do {
			*i = (*i >> 1) + 1;
			j += *i;
			i--;
		} while (--ns);
		m->ft = j;
	}
}

/* Decode one symbol with model m, then let the model learn it
 * (decompress).  ns is the number of symbols in m. */
static inline u32
mget_symbol(Model *m, u32 ns, aStream *as)
{
	u32 decoded;

	decoded = get_symbol(m->fq, m->ft, as);
	update_model(decoded, m, ns);
	return decoded;
}

/* Read in a block through the structured model and un-MTF it.
 *   bl:  start of the output buffer
 *   lim: one past the last byte that may be written
 *   as:  decoder state
 * Return the size, or 0 if the decoded data does not fit in the
 * buffer (the caller treats a zero size as a bad block).
 *
 * All bounds checks compare lengths instead of advanced pointers, so
 * a corrupt run length can neither wrap the write pointer nor the
 * run counter itself.
 */
static u32
mtf_get_block(byte *bl, byte *lim, aStream *as)
{
	byte mt[0x100];
	byte *b = bl;

	if (bl > lim)
		return 0;
	/* Start with the identity move-to-front table */
	{
		u32 i = 0xff;
		do mt[i] = (byte)i; while (--i);
		mt[0] = 0x00;
	}
	for(;;) {
		u32 ns, sym, i = 0;
		/* b never passes lim, so this cannot be negative */
		u32 room = (u32)(lim - b);

		/* Get RLE bits: each digit doubles the run and adds 1 or 2 */
		for(;;) {
			sym = mget_symbol(AM_BASE, as);
			if (sym == AVB_R1)
				i += i + 1;
			else if (sym == AVB_R2)
				i += i + 2;
			else
				break;
			/* Stop before the run can outgrow the buffer */
			if (i > room)
				return 0;
		}
		/* Map the base symbol to a move-to-front position */
		switch (sym) {
		case AVB_02_03:
			ns = mget_symbol(AM_02_03, as) + 0x02;
			break;
		case AVB_04_07:
			ns = mget_symbol(AM_04_07, as) + 0x04;
			break;
		case AVB_08_0F:
			ns = mget_symbol(AM_08_0F, as) + 0x08;
			break;
		case AVB_10_1F:
			ns = mget_symbol(AM_10_1F, as) + 0x10;
			break;
		case AVB_20_3F:
			ns = mget_symbol(AM_20_3F, as) + 0x20;
			break;
		case AVB_40_7F:
			ns = mget_symbol(AM_40_7F, as) + 0x40;
			break;
		case AVB_80_FF:
			/* 0x80 from this submodel yields AV_EOB */
			ns = mget_symbol(AM_80_FF, as) + 0x80;
			break;
		default:
			ns = 1;
		}
		/* A run repeats the byte at the front of the table */
		if (i) {
			memset(b, mt[0], i);
			b += i;
		}
		if (ns == AV_EOB)
			break;
		if (b >= lim)
			return 0;
		/* Emit the byte at position ns and move it to the front */
		*b = mt[ns];
		memmove(mt + 1, mt, ns);
		mt[0] = *b++;
	}
	return (b - bl);
}

/* CRC32 stuff */
/* Lookup table with one entry per byte value; it is indexed with the
 * low byte of (crc ^ data) by check_crc32 and mtf_send_block. */
static u32 crcTable[256] = {
	0x00000000L,	0x77073096L,	0xee0e612cL,	0x990951baL,
	0x076dc419L,	0x706af48fL,	0xe963a535L,	0x9e6495a3L,
	0x0edb8832L,	0x79dcb8a4L,	0xe0d5e91eL,	0x97d2d988L,
	0x09b64c2bL,	0x7eb17cbdL,	0xe7b82d07L,	0x90bf1d91L,
	0x1db71064L,	0x6ab020f2L,	0xf3b97148L,	0x84be41deL,
	0x1adad47dL,	0x6ddde4ebL,	0xf4d4b551L,	0x83d385c7L,
	0x136c9856L,	0x646ba8c0L,	0xfd62f97aL,	0x8a65c9ecL,
	0x14015c4fL,	0x63066cd9L,	0xfa0f3d63L,	0x8d080df5L,
	0x3b6e20c8L,	0x4c69105eL,	0xd56041e4L,	0xa2677172L,
	0x3c03e4d1L,	0x4b04d447L,	0xd20d85fdL,	0xa50ab56bL,
	0x35b5a8faL,	0x42b2986cL,	0xdbbbc9d6L,	0xacbcf940L,
	0x32d86ce3L,	0x45df5c75L,	0xdcd60dcfL,	0xabd13d59L,
	0x26d930acL,	0x51de003aL,	0xc8d75180L,	0xbfd06116L,
	0x21b4f4b5L,	0x56b3c423L,	0xcfba9599L,	0xb8bda50fL,
	0x2802b89eL,	0x5f058808L,	0xc60cd9b2L,	0xb10be924L,
	0x2f6f7c87L,	0x58684c11L,	0xc1611dabL,	0xb6662d3dL,
	0x76dc4190L,	0x01db7106L,	0x98d220bcL,	0xefd5102aL,
	0x71b18589L,	0x06b6b51fL,	0x9fbfe4a5L,	0xe8b8d433L,
	0x7807c9a2L,	0x0f00f934L,	0x9609a88eL,	0xe10e9818L,
	0x7f6a0dbbL,	0x086d3d2dL,	0x91646c97L,	0xe6635c01L,
	0x6b6b51f4L,	0x1c6c6162L,	0x856530d8L,	0xf262004eL,
	0x6c0695edL,	0x1b01a57bL,	0x8208f4c1L,	0xf50fc457L,
	0x65b0d9c6L,	0x12b7e950L,	0x8bbeb8eaL,	0xfcb9887cL,
	0x62dd1ddfL,	0x15da2d49L,	0x8cd37cf3L,	0xfbd44c65L,
	0x4db26158L,	0x3ab551ceL,	0xa3bc0074L,	0xd4bb30e2L,
	0x4adfa541L,	0x3dd895d7L,	0xa4d1c46dL,	0xd3d6f4fbL,
	0x4369e96aL,	0x346ed9fcL,	0xad678846L,	0xda60b8d0L,
	0x44042d73L,	0x33031de5L,	0xaa0a4c5fL,	0xdd0d7cc9L,
	0x5005713cL,	0x270241aaL,	0xbe0b1010L,	0xc90c2086L,
	0x5768b525L,	0x206f85b3L,	0xb966d409L,	0xce61e49fL,
	0x5edef90eL,	0x29d9c998L,	0xb0d09822L,	0xc7d7a8b4L,
	0x59b33d17L,	0x2eb40d81L,	0xb7bd5c3bL,	0xc0ba6cadL,
	0xedb88320L,	0x9abfb3b6L,	0x03b6e20cL,	0x74b1d29aL,
	0xead54739L,	0x9dd277afL,	0x04db2615L,	0x73dc1683L,
	0xe3630b12L,	0x94643b84L,	0x0d6d6a3eL,	0x7a6a5aa8L,
	0xe40ecf0bL,	0x9309ff9dL,	0x0a00ae27L,	0x7d079eb1L,
	0xf00f9344L,	0x8708a3d2L,	0x1e01f268L,	0x6906c2feL,
	0xf762575dL,	0x806567cbL,	0x196c3671L,	0x6e6b06e7L,
	0xfed41b76L,	0x89d32be0L,	0x10da7a5aL,	0x67dd4accL,
	0xf9b9df6fL,	0x8ebeeff9L,	0x17b7be43L,	0x60b08ed5L,
	0xd6d6a3e8L,	0xa1d1937eL,	0x38d8c2c4L,	0x4fdff252L,
	0xd1bb67f1L,	0xa6bc5767L,	0x3fb506ddL,	0x48b2364bL,
	0xd80d2bdaL,	0xaf0a1b4cL,	0x36034af6L,	0x41047a60L,
	0xdf60efc3L,	0xa867df55L,	0x316e8eefL,	0x4669be79L,
	0xcb61b38cL,	0xbc66831aL,	0x256fd2a0L,	0x5268e236L,
	0xcc0c7795L,	0xbb0b4703L,	0x220216b9L,	0x5505262fL,
	0xc5ba3bbeL,	0xb2bd0b28L,	0x2bb45a92L,	0x5cb36a04L,
	0xc2d7ffa7L,	0xb5d0cf31L,	0x2cd99e8bL,	0x5bdeae1dL,
	0x9b64c2b0L,	0xec63f226L,	0x756aa39cL,	0x026d930aL,
	0x9c0906a9L,	0xeb0e363fL,	0x72076785L,	0x05005713L,
	0x95bf4a82L,	0xe2b87a14L,	0x7bb12baeL,	0x0cb61b38L,
	0x92d28e9bL,	0xe5d5be0dL,	0x7cdcefb7L,	0x0bdbdf21L,
	0x86d3d2d4L,	0xf1d4e242L,	0x68ddb3f8L,	0x1fda836eL,
	0x81be16cdL,	0xf6b9265bL,	0x6fb077e1L,	0x18b74777L,
	0x88085ae6L,	0xff0f6a70L,	0x66063bcaL,	0x11010b5cL,
	0x8f659effL,	0xf862ae69L,	0x616bffd3L,	0x166ccf45L,
	0xa00ae278L,	0xd70dd2eeL,	0x4e048354L,	0x3903b3c2L,
	0xa7672661L,	0xd06016f7L,	0x4969474dL,	0x3e6e77dbL,
	0xaed16a4aL,	0xd9d65adcL,	0x40df0b66L,	0x37d83bf0L,
	0xa9bcae53L,	0xdebb9ec5L,	0x47b2cf7fL,	0x30b5ffe9L,
	0xbdbdf21cL,	0xcabac28aL,	0x53b39330L,	0x24b4a3a6L,
	0xbad03605L,	0xcdd70693L,	0x54de5729L,	0x23d967bfL,
	0xb3667a2eL,	0xc4614ab8L,	0x5d681b02L,	0x2a6f2b94L,
	0xb40bbe37L,	0xc30c8ea1L,	0x5a05df1bL,	0x2d02ef8dL
};

/* Compute the crc32 over the sz bytes at bl and compare it with crc.
 * Returns 0 when they match and 1 when they differ.
 * An empty block (sz == 0) always counts as a mismatch. */
static inline bool
check_crc32(byte *bl, u32 sz, u32 crc)
{
	u32 acc = 0xffffffffL;
	byte *p, *end;

	if (sz < 1)
		return 1;

	end = bl + sz;
	for (p = bl; p != end; p++)
		acc = (acc >> 8) ^ crcTable[(acc ^ *p) & 0xff];

	/* The final value is the inverted accumulator */
	return ((acc ^ 0xffffffffL) != crc);
}

/* Code symbol s with model m, then let the model learn it (compress).
 * ns is the number of symbols in m. */
/* NOTE: inlining this function makes compression 3-4% slower  (why ???) */
static void
model_symbol(u32 s, Model *m, u32 ns, aStream *as)
{
	u32 *freqs = m->fq;
	u32 total = m->ft;

	send_symbol(s, freqs, total, as);
	update_model(s, m, ns);
}

/* Send the length of a run of zeroes, MSB first.
 * What is sent are the bits of z + 1 below its leading one bit:
 * AVB_R2 for a set bit and AVB_R1 for a clear bit.
 * z must be at least 1. */
static inline void
send_zeroes(u32 z, aStream *as)
{
	u32 value = z + 1;
	u32 bit = 0;
	u32 rest = value;

	/* Count the bits below the leading one */
	do {
		bit++;
		rest >>= 1;
	} while (rest > 1);

	/* Send them from high to low */
	while (bit > 0) {
		bit--;
		if (value & (1 << bit))
			model_symbol(AVB_R2, AM_BASE, as);
		else
			model_symbol(AVB_R1, AM_BASE, as);
	}
}

/* Pipe a block through MTF and then the structured model
 *   bl: array of sz sorted pointers; the byte taken for each entry is
 *       the b4 byte of the element just before the one it refers to
 *   sz: number of entries (at least 1, the loop runs before testing)
 *   as: coder state
 * Bytes equal to the front of the MTF table are only counted and sent
 * as a run length.  After the block an EOB symbol is sent, followed
 * by the crc32 of all bytes that were coded. */
static void
mtf_send_block(p32_8 *bl, u32 sz, aStream *as)
{
	byte mt[0x100];
	u32 i, crc = 0xffffffffL;

	/* Identity MTF table; this loop also leaves i at 0 */
	i = 0xff;
	do mt[i] = (byte)i; while (--i);
	mt[0] = 0x00;
	do {
		byte b = ref(bl[0])[-1].b.b4;
		crc = (crc >> 8) ^ crcTable[(crc ^ b) & 0xff];

		if (b == mt[0]) {
			/* Front of the table: extend the current run */
			i++;
		} else {
			byte t1, t2;

			/* The MTF loop is divided into pieces
			 * which match the structured model,
			 * so I know which model I'm in
			 * when I find the byte.
			 * Unfortunately, I need a goto.. */
			if (i) send_zeroes(i, as);
			if (b == mt[1]) {
				mt[1] = mt[0];
				model_symbol(AVB_ONE, AM_BASE, as);
				goto ugly_label;
			}
			/* From here on i is the table position being tested;
			 * t1 is the entry found there, t2 the entry that
			 * is shifted into its place. */
			t2 = mt[0];
			t1 = mt[1];
			mt[1] = t2;
			i = 1;
			do {
				t2 = t1;
				i++;
				t1 = mt[i];
				mt[i] = t2;
				if (t1 == b) {
					model_symbol(AVB_02_03, AM_BASE, as);
					model_symbol(i - 0x02, AM_02_03, as);
					goto ugly_label;
				}
			} while (i < 0x03);
			do {
				t2 = t1;
				i++;
				t1 = mt[i];
				mt[i] = t2;
				if (t1 == b) {
					model_symbol(AVB_04_07, AM_BASE, as);
					model_symbol(i - 0x04, AM_04_07, as);
					goto ugly_label;
				}
			} while (i < 0x07);
			do {
				t2 = t1;
				i++;
				t1 = mt[i];
				mt[i] = t2;
				if (t1 == b) {
					model_symbol(AVB_08_0F, AM_BASE, as);
					model_symbol(i - 0x08, AM_08_0F, as);
					goto ugly_label;
				}
			} while (i < 0x0F);
			do {
				t2 = t1;
				i++;
				t1 = mt[i];
				mt[i] = t2;
				if (t1 == b) {
					model_symbol(AVB_10_1F, AM_BASE, as);
					model_symbol(i - 0x10, AM_10_1F, as);
					goto ugly_label;
				}
			} while (i < 0x1F);
			do {
				t2 = t1;
				i++;
				t1 = mt[i];
				mt[i] = t2;
				if (t1 == b) {
					model_symbol(AVB_20_3F, AM_BASE, as);
					model_symbol(i - 0x20, AM_20_3F, as);
					goto ugly_label;
				}
			} while (i < 0x3F);
			do {
				t2 = t1;
				i++;
				t1 = mt[i];
				mt[i] = t2;
				if (t1 == b) {
					model_symbol(AVB_40_7F, AM_BASE, as);
					model_symbol(i - 0x40, AM_40_7F, as);
					goto ugly_label;
				}
			} while (i < 0x7F);
			/* Last piece: no limit test, the loop ends when
			 * the byte is found */
			do {
				t2 = t1;
				i++;
				t1 = mt[i];
				mt[i] = t2;
			} while (t1 != b);
			model_symbol(AVB_80_FF, AM_BASE, as);
			model_symbol(i - 0x80, AM_80_FF, as);
		ugly_label:
			/* The byte moves to the front; start a new run count */
			mt[0] = b;
			i = 0;
		}
		bl++;
	} while (--sz);
	/* Flush a run that lasted until the end of the block */
	if (i) send_zeroes(i, as);

	/* EOB is packed in with the 80_FF submodel */
	model_symbol(AVB_80_FF, AM_BASE, as);
	model_symbol(0x80, AM_80_FF, as);

	crc ^= 0xffffffffL;
	send_u32(crc, as);
}

/* The Burrows-Wheeler Transform */

static u32 bSize = 0;	/* Blocksize in bytes (decompress_realloc stores a count of 64k units shifted by BLOCK_BITS) */
static u32 memSize = 0;	/* Size of memory allocated (decompress) */
/* Number of data bytes that are put into one block */
#define BUFFER_SIZE (bSize - 2)

/* One bucket per 16-bit value */
#define NO_BUCKETS 0x10000

/* Bounds of the striped data of the current block (set in do_bwt) */
static u32_8 *bStart;
static u32 *bEnd;

#if 0
/* Disabled code, never compiled.
 * NOTE(review): the expression below adds two pointers, which is not
 * valid C; it would have to be rewritten before this is enabled. */
/* Return the middle between two p32_8 * pointers */
static inline p32_8 *
avg_pointer(p32_8 *a, p32_8 *b)
{
	/* Weird construction needed, because they're pointers */
	return a + ((a + b) / 2);
}
#endif

/* quicksort c from l to r, assuming the first d chars are equal.
 * recursive version (only compare the d-th word)
 *   l, r: the range to sort; l is the first entry, r is one past the
 *         last.  The range must hold at least two entries, because
 *         the partition loop runs once before testing.
 *   d:    offset, in u32 words, of the word to compare
 * The range is split three ways: smaller than the pivot word, equal
 * to it and larger.  The outer parts are sorted recursively at the
 * same depth, the equal part is sorted again at depth d + 4.
 */
static void
quick_sort(p32_8 *l, p32_8 *r, u32 d)
{
	register p32_8 *i, *j;
	p32_8 *k, t;
	u32 *b_end;

quicksort_tail_recurse:
#if 0
	i = avg_pointer(l, r);	/* Is this necessary ?? */
	t = *i;
	*i = *l;
	*l = t;
#else
	/* The first entry is the pivot */
	t = *l;
#endif
	/* [l, i) holds smaller entries, [i, j) equal ones,
	 * [k, r) larger ones; [j, k) is still to be examined */
	i = l;
	j = i + 1;
	k = r;
	b_end = bEnd;

	/* t[d] should not be out of bounds */
	if ((ref_u32(t) + d) < b_end)
	{
		register u32 h = *(ref_u32(t) + d);
		do {
			register u32 *td;
			t = *j;
			td = ref_u32(t) + d;
			/* if t is shorter than d, it's smaller than h */
			if ((td >= b_end) || (*td < h)) {
				/* t < h */
				*j = *i;
				*i = t;
				i++; j++;
			} else if (*td > h) {
				/* t > h */
				k--;
				*j = *k;
				*k = t;
			} else {
				/* t == h */
				j++;
			}
		} while (k > j);
	} else {	/* Divide on length */
		/* The pivot itself has no word at offset d, so the entries
		 * are ordered by their position instead */
		register p32_8 h = t;
		do {
			t = *j;
			/* A higher position is treated as the smaller entry */
			if (t > h) {
				/* t < h */
				*j = *i;
				*i = t;
				i++; j++;
			} else {
				/* t > h */
				k--;
				*j = *k;
				*k = t;
			}
			/* t == h is impossible. */
		} while (k > j);
	}
	/* Sort the smaller and the larger part at the same depth */
	if (i > (l + 1))
		quick_sort(l, i, d);
	if (r > (j + 1))
		quick_sort(j, r, d);
	/* Sort the equal part four words deeper, without recursing */
	if (j > (i + 1)) {
		/* quick_sort(i, j, d + 4); */
		l = i;
		r = j;
		d += 4;
		goto quicksort_tail_recurse;
	}
}

/* 256-way radix sort
 *   l, r: the range to sort; l is the first entry, r is one past the
 *         last
 * Ranges of fewer than 0x200 entries go straight to quick_sort.
 * Larger ranges are first distributed over 256 buckets on the b2 byte
 * of the element each entry refers to; every bucket with more than
 * one entry is then finished with quick_sort.
 */
static inline void
radix_sort(p32_8 *l, p32_8 *r)
{
	/* b:  bucket sizes, then bucket starts, then fill positions
	 * b2: bucket ends */
	u32 b[0x100], b2[0x100];

	/* 0x200 is an educated guess.. */
	if (r < (l + 0x200)) {
		quick_sort(l, r, 2);
		return;
	}
	/* Empty buckets */
	{
		u32 i = 0x100;
		do b[i-1] = 0; while (--i);
	}
	/* calc sizes */
	{
		p32_8 *i = l;
		do ++b[(ref(*i))->b.b2]; while (++i < r);
	}
	/* divide buckets: turn the sizes into start offsets, from the top.
	 * The loop stops once offset 0 is reached; the buckets below
	 * that are empty, so their size (0) already is their start. */
	{
		u32 *i = &b[0xff], j = r - l;
		b2[0xff] = j;
		do {
			*i = (j -= *i);
			i--;
		} while (j);
	}
	/* copy b into b2: a bucket ends where the next one starts */
	{
		u32 k;
		for (k = 1; k < 0x100; k++)
			b2[k - 1] = b[k];
	}
	/* distribute data:
	 * To avoid a temp array, I distribute the
	 * data in permutation cycles.
	 * This may seem hairy, and it is.. */
	{
		/* ASM: Compiler doesn't put t in a register */
		p32_8 t;
		u32 k = 0;
		/* A slot holding 0 is a slot whose entry has been taken out */
		t = *l;
		*l = 0;
		do {
			/* Put t in the next free slot of its bucket and
			 * continue with the entry that was there */
			p32_8 *lb = l + (b[ref(t)->b.b2]++);
			p32_8 t2 = *lb;
			*lb = t;
			if (t2) {
				t = t2;
			} else {
				/* New cycle */
				/* Find the first bucket that is not full yet */
				u32 bk, kk = k;
				while ((bk = b[kk]) == b2[kk])
					if (++kk == 0x100)
						goto radix_loop_end;
				t = l[bk];
				l[bk] = 0;
				k = kk;
			}
		} while (1);
	radix_loop_end:
		;	/* a label has to be followed by a statement */
	}
	/* sort buckets (b[] now holds the end of every bucket) */
	{
		u32 *i = b, *p = b + 0xff;
		do {
			if (p[0] > (p[-1] + 1))
				quick_sort(l + p[-1], l + p[0], 3);
		} while (--p > i);
		if (b[0] > 1)
			quick_sort(l, l + b[0], 3);
	}
}

/* Sort the buckets *b
 *   b: table of bucket boundaries (do_bwt passes rdxB)
 *   c: the pointer array to sort (do_bwt passes ptrB)
 *   s: the 256 first-level bucket numbers in processing order
 *   j: one more than the index into s of the bucket to handle
 * Handles first-level bucket bc = s[j-1], whose 256 sub-buckets are
 * bounded by b1[0] .. b1[0x100]:
 *  1. radix_sort every sub-bucket named by s[0] .. s[j-2] that holds
 *     more than one entry,
 *  2. walk over bucket bc from both ends and use it to fill the
 *     sub-buckets '?a' of the other first-level buckets,
 *  3. overwrite the data word of every entry of bucket bc with a
 *     count and put bc back in its top byte.
 */
static void
sort_bucket(u32 *b, register p32_8 *c, byte s[], u32 j)
{
	u32 b2[0x100];
	u32 bc = s[j-1];
	u32 *b1 = &b[bc << 8];

	/* Nothing to do for a bucket with fewer than two entries */
	if ((b1[0x100] - b1[0]) < 2)
		return;

	/* Sort the buckets 'ab' that aren't already sorted
	 * by creation (see below) */
	if (--j) do {
		register u32 i = s[j-1];

		if (b1[i+1] > (b1[i] + 1))
			radix_sort(c + b1[i], c + b1[i+1]);
	} while (--j);

	if (bc == (bStart->b.b4)) {
		/* Same as the else branch, except that entries which do
		 * not lie above bStart are skipped */
		u32 *b3;
		u32 i;
		u32_8 *b_start = bStart;

		/* Create lower half of buckets '?a' from below */
		/* b2[x] = boundary entry of b for sub-bucket (x, bc) */
		i = 0xff;
		b3 = b + bc + 0xff00;
		do {
			b2[i] = *b3;
			b3 -= 0x100;
		} while (--i);
		b2[0] = *b3;
		/* Walk upwards until the fill position of (bc, bc) is met */
		b3 = &b2[bc];
		for (i = b1[0]; i != *b3; i++)
			if (ref(c[i]) > b_start)
				c[b2[ref(c[i])[-1].b.b4]++] = c[i] - 1;

		/* Create upper half of buckets '?a' */
		/* b2[x] = the next boundary entry of b, minus one */
		i = 0xff;
		b3 = b + bc + 0xff01;
		do {
			b2[i] = *b3 - 1;
			b3 -= 0x100;
		} while (--i);
		b2[0] = *b3 - 1;
		/* Walk downwards until the fill position of (bc, bc) is met */
		b3 = &b2[bc];
		for (i = b1[0x100] - 1; i != *b3; i--)
			if (ref(c[i]) > b_start)
				c[b2[ref(c[i])[-1].b.b4]--] = c[i] - 1;
	} else {
		/* Create lower half of buckets '?a' from below */
		u32 *b3;
		u32 i = 0xff;
		b3 = b + bc + 0xff00;
		do {
			b2[i] = *b3;
			b3 -= 0x100;
		} while (--i);
		b2[0] = *b3;
		b3 = &b2[bc];
		for (i = b1[0]; i != *b3; i++)
			c[b2[ref(c[i])[-1].b.b4]++] = c[i] - 1;

		/* Create upper half of buckets '?a' */
		i = 0xff;
		b3 = b + bc + 0xff01;
		do {
			b2[i] = *b3 - 1;
			b3 -= 0x100;
		} while (--i);
		b2[0] = *b3 - 1;
		b3 = &b2[bc];
		for (i = b1[0x100] - 1; i != *b3; i--)
			c[b2[ref(c[i])[-1].b.b4]--] = c[i] - 1;
	}

	/* replace low 24 bits with bucket index */
	/* Runs from the last entry of the bucket to the first; the value
	 * written counts down from the bucket size to 1 */
	{
		u32 i = b1[0x100] - b1[0];
		p32_8 *c2 = c + b1[0x100] - 1;
		byte b = bc;

		do {
			ref(c2[0])->l = i;
			ref(c2[0])->b.b4 = b;
			--c2;
		} while (--i);
	}
}

/* Sort buckets b by size
 *   l, r: range of bucket numbers to sort; l is the first, r is one
 *         past the last
 *   b:    table of bucket boundaries; the size of bucket t is
 *         b[(t << 8) + 0x100] - b[t << 8]
 * Larger buckets are moved to the front of the range and smaller
 * ones to the back, so the result is ordered from large to small.
 */
static void
qsort_size(byte *l, byte *r, u32 *b)
{
	byte *i, *j, *k, t;
	u32 h;

qsort_size_tail_recurse:
	/* The pivot is the size of the bucket in the middle of the range */
	t = l[(r - l) / 2];
	h = b[(t << 8) + 0x100] - b[t << 8];

	/* [l, i) is larger than the pivot, [i, j) equal to it,
	 * [k, r) smaller; [j, k) is still to be examined */
	i = j = l;
	k = r;
	do {
		t = *j;
		if ((b[(t << 8) + 0x100] - b[t << 8]) < h) {
			--k;
			*j = *k;
			*k = t;
		} else {
			if ((b[(t << 8) + 0x100] - b[t << 8]) > h) {
				*j = *i;
				*i = t;
				i++;
			}
			j++;
		}
	} while (k > j);
	/* Recurse on the front part, loop on the back part */
	if (i > (l + 1))
		qsort_size(l, i, b);
	if (r > (j + 1)) {
		l = j;
		goto qsort_size_tail_recurse;
	}
}

/* reverse and stripe block src into block tgt,
 * (and make tgt[0] equal tgt[sz] )
 *   src: sz input bytes
 *   tgt: sz + 1 words, filled from tgt[sz] down to tgt[1]
 *   sz:  number of bytes, at least 1
 * Every word holds one input byte in its top byte, with the (up to)
 * three bytes that came before it in the input below that.
 */
static inline void
stripe_block(byte *src, u32_8 *tgt, u32 sz)
{
	u32 i = sz, v = 0;
	do {
		v >>= 8;
		/* Widen before shifting: the byte would otherwise be
		 * promoted to a signed int, for which shifting a value
		 * of 0x80 or more by 24 bits is undefined */
		v |= (u32)(*src++) << 24;
		tgt[i].l = v;
	} while (--i);
	tgt[0].l = tgt[sz].l;
}

/* Main BW function.
 * Does radix sort and calls sort_bucket on all buckets
 * in order of size (smallest first)
 *   l: number of input bytes in inB (at least 1)
 * Works on the globals: inB is striped into sB, rdxB receives the
 * bucket boundaries and ptrB the sorted pointers.
 * Returns the index in ptrB of the entry that refers to sB[1],
 * or 0 if no index from 1 to l-1 does. */
static u32
do_bwt(u32 l)
{
	byte s[0x100];	/* bucket numbers, ordered by bucket size */

	stripe_block(inB, sB, l);

	/* Init some global stuff */
	bStart = &sB[1];
	bEnd = (u32 *)&sB[l+1];

	/* Empty buckets */
	/* Clears entries 1 .. NO_BUCKETS; entry 0 is set further down */
	{
		u32 *b = rdxB, j = 0;
		u32 i = NO_BUCKETS;
		do b[i] = j; while (--i);
	}
	/* Calc bucket sizes */
	/* The bucket number is the w2 half of each striped word */
	{
		u32 *b = rdxB + 1;
		u32_8 *b2 = sB;
		u32 i = l;
		do ++b[b2[i].w.w2]; while (--i);
	}
	
	/* Divide buckets */
	/* Turn the sizes into start offsets, working from the top */
	{
		u32 *b = rdxB, j = l;
		u32 i = NO_BUCKETS;
		do b[i] = (j -= b[i]); while (--i);
	}
	
	/* Order buckets by size */
	{
		u32 i = 0xff;
		do s[i] = (byte)i; while (--i);
		s[0] = i;
	}
	qsort_size(s, s + 0x100, rdxB + 1);

	/* Distribute data over buckets.
	 * NB: It is important that the buckets are filled in
	 *     reverse because of a special case involving EOF
	 *     (a,EOF ends up in bucket a,0 and should be the first)
	 */
	{
		u32 *b = rdxB + 1;
		u32_8 *b2 = &sB[l];
		p32_8 *c = ptrB;
		do c[b[b2->w.w2]++] = deref(b2); while (--b2 > sB);
	}

	/* Sort the buckets */
	/* sort_bucket takes s[i-1], so this runs from s[0xff] to s[0] */
	{
		u32 i = 0x100;
		rdxB[0] = 0;
		do sort_bucket(rdxB, ptrB, s, i); while (--i);
	}
	
	/* Find origin pointer */
	{
		p32_8 *c = ptrB;
		u32_8 *b2 = sB + 1;
		u32 i = l;
		while (--i)
			if (ref(c[i]) == b2)
				return i;
		return 0;
	}
}

/* Running totals of the bytes read and written (updated by compress) */
static u32 inBytes, outBytes;

/* Write out compression statistics
 *   in:  number of uncompressed bytes
 *   out: number of compressed bytes
 * All ratios are computed in floating point.  Multiplying the byte
 * counts as integers (out * 100, out * 8) wraps around for large
 * files and printed wrong percentages.
 * The counts are printed as unsigned long, which is a correct match
 * for the format whatever type u32 is on this machine.
 */
static void
print_stats(u32 in, u32 out)
{
	double din = (double)in;
	double dout = (double)out;

	fprintf(stderr, "%.3f:1, %.3f bits/byte, "
			"%.2f%% saved, %lu in, %lu out.\n",
		din / dout, (dout * 8.0) / din,
		100.0 - (dout * 100.0) / din,
		(unsigned long)in, (unsigned long)out);
}

/* Compress from stream in to stream out
 * Writes the magic "BWC", then hands the block size byte to the arith
 * coder as its first held-back byte.  Every block is transformed and
 * sent as: origin pointer, modeled data, crc32.
 * The size of the first read becomes the block size for the rest of
 * the file.
 * Both streams are closed when compression succeeds.
 * Returns 0 on success, 1 on failure (the error has been reported).
 *
 * Every fread is checked for an error right away.  A failing read
 * that returns no data would otherwise just end the loop, and the
 * truncated output would be reported as a success.
 */
static bool
compress(FILE *in, FILE *out)
{
	u32 inb, outb;
	aStream as;
	u32 op, sz, bsz = BUFFER_SIZE;

	/* Refuse to write compressed data to a terminal */
	if (isatty(fileno(out))) {
		terminal_error(outFileName);
		return 1;
	}

	init_models();
	if (fputs("BWC", out) == EOF) {
		output_error();
		return 1;
	}
	sz = fread(inB, 1, bsz, in);
	if (ferror(in)) {
		input_error();
		return 1;
	}
	/* Do not use bigger blocks than the first read delivered */
	bsz = sz;
	start_coder(out, bsz >> BLOCK_BITS, &as);

	while (sz) {
		op = do_bwt(sz);
		send_u32(op, &as);
		mtf_send_block(ptrB, sz, &as);
		sz = fread(inB, 1, bsz, in);
		if (ferror(in)) {
			input_error();
			return 1;
		}
	}
	if (end_coder(&as))
		return 1;

	if (Flg.verbose) {
		inb = ftell(in);
		outb = ftell(out);
		inBytes += inb;
		outBytes += outb;
		print_stats(inb, outb);
	}
	if (fclose(in) == EOF) {
		input_error();
		return 1;
	}
	if (fclose(out) == EOF) {
		output_error();
		return 1;
	}
	return 0;
}

/* Undo the transform on the block in vecB and write the result.
 * The input block is spread over the MSBs of the vecB array
 * Only works if sz < (1 << 24)  (16 Mb)
 * Why the hell is this faster ?????
 *   sz:  number of bytes in the block (they sit at the start of vecB)
 *   op:  origin pointer of the block, must be smaller than sz
 *   out: stream to write to
 * Returns 0 on success, 1 on failure (the error has been reported).
 */
static inline bool
unbwt_and_write(u32 sz, u32 op, FILE *out)
{
	u32 b[0x100];

	if (op >= sz) {
		block_error();
		return 1;
	}
	/* Clear the byte counters */
	{
		u32 i = 0xff, j = 0;
		do b[i] = j; while (--i);
		b[0] = j;
	}
	/* Expand every byte into the top byte of its own word.
	 * Going from the last byte to the first means that no byte is
	 * overwritten before it has been read.
	 * The byte is widened before shifting: as a promoted signed int,
	 * shifting a value of 0x80 or more by 24 bits is undefined. */
	{
		u32 i = sz;
		u32_8 *v = vecB;
		byte *d = (byte *)vecB;
		do {
			v[i - 1].l = (u32)d[i - 1] << 24;
		} while (--i);
	}
	/* Give every word, in its low 24 bits, the number of equal bytes
	 * counted before it.  The word at the origin is counted first,
	 * then the words before it, then the words after it. */
	{
		/* ASM: compiler could leave unneeded instructions */
		u32_8 *v = vecB, *e = v + op;
		e->l += b[e->b.b4]++;
		while (v < e) {
			v->l += b[v->b.b4]++;
			v++;
		}
		e = vecB + sz;
		while (++v < e)
			v->l += b[v->b.b4]++;
	}
	/* Turn the counts into start offsets, working from the top */
	{
		u32 i = 0xff, j = sz;
		do b[i] = (j -= b[i]); while (--i);
		b[0] = i;
	}
	/* Follow the chain from the origin and write sz bytes */
	{
		u32 i = sz;
		u32_8 *v = vecB;
		do {
			u32 in_op = v[op].b.b4;
			putc(in_op, out);
			op = (low24(v[op])) + b[in_op];
		} while (--i);
	}
	if (ferror(out)) {
		output_error();
		return 1;
	}
	return 0;
}

/* Allocate big memory for decompression
 * (unless there already was a big enough block)
 *   bsz: wanted block size in units of 64k
 * Sets bSize to the block size in bytes and makes vecB hold one u32
 * per byte of the block.
 * Returns 0 on success.  On failure a message is printed, Flg.error
 * is set and 1 is returned; vecB and memSize are left as they were.
 *
 * The sizes in the message are printed as unsigned long: the byte
 * count is a size_t and must not be passed for a %u conversion.
 */
static inline bool
decompress_realloc(u32 bsz)
{
	void *new_mem;
	bsz <<= BLOCK_BITS;
	bSize = bsz;

	if ((bsz * sizeof(u32)) <= memSize)
		return 0;
	new_mem = realloc((void *)vecB, bsz * sizeof(u32));
	if (new_mem) {
		memSize = bsz * sizeof(u32);
		vecB = (u32_8 *)new_mem;
		return 0;
	}
	fprintf(stderr, "%s: not enough memory for decompressing "
		"%s with buffersize %lu (needed %lu bytes)\n",
		myName, inFileName, (unsigned long)bsz,
		(unsigned long)(bsz * sizeof(u32)));
	Flg.error = 1;
	return 1;
}

/* Decompress from stream in to stream out
 * Checks the magic "BWC", reads the block size byte and makes sure
 * the buffer is large enough.  Then, until the input is exhausted,
 * it reads an origin pointer and a block, checks the crc32 of the
 * block and (unless only testing) writes the detransformed block.
 * Returns 0 on success, 1 on failure (the error has been reported).
 * The streams are closed on success and when the allocation fails;
 * the other error paths return with both streams still open.
 */
static bool
decompress(FILE *in, FILE *out)
{
	aStream as;
	u32 bsz, sz, op;

	/* Refuse to read compressed data from a terminal */
	if (isatty(fileno(in))) {
		terminal_error(inFileName);
		return 1;
	}

	/* Get the magic word */
	if ((fgetc(in) != 'B') || (fgetc(in) != 'W') || (fgetc(in) != 'C'))
	{
		magic_error();
		return 1;
	}
	/* The byte after the magic is the block size in 64k units, minus one */
	/* NOTE(review): the result of getc is not compared with EOF here,
	 * so a file that ends right after the magic is not rejected at
	 * this point - confirm that this is intended. */
	sz = getc(in);
	if (decompress_realloc(sz + 1)) {
		fclose(in);
		fclose(out);
		return 1;
	}

	init_models();
	start_decoder(in, &as);
	for(;;) {
		op = get_u32(&as);
		/* Running into the end of the input ends the stream */
		if (feof(in))
			break;
		bsz = mtf_get_block((byte *)vecB, ((byte *)vecB) + BUFFER_SIZE, &as);
		/* A block of size 0 fails this check as well */
		if (check_crc32((byte *)vecB, bsz, get_u32(&as))) {
			block_error();
			return 1;
		}
		if (!Flg.test && unbwt_and_write(bsz, op, out))
			return 1;
	}
	if (fclose(in) == EOF) {
		input_error();
		return 1;
	}
	/* out is left alone when only testing */
	if (!Flg.test && (fclose(out) == EOF)) {
		output_error();
		return 1;
	}
	return 0;
}

/* housekeeping and main() */

/* The streams currently in use, kept in globals
 * for error reporting and recovery */
static FILE *inFileHandle, *outFileHandle;

/* Allocate the big memory blocks for compressing
 * Exits on failure.
 * A single allocation holds, in this order: bSize striped words (sB),
 * bSize pointers (ptrB) and the 0x10001 entry radix table (rdxB).
 *
 * The sizes in the messages are printed as unsigned long: the byte
 * count is a size_t and must not be passed for a %u conversion.
 */
static inline void
compress_alloc(void)
{
	u32 bsz = bSize;
	size_t need = bsz * sizeof(u32) +
		bsz * sizeof(p32_8) + 0x10001 * sizeof(u32);

	if (Flg.verbose)
		fprintf(stderr, "%s: compressing with blocksize %luKb\n",
			myName, (unsigned long)(bsz >> 10));
	sB = (u32_8 *)malloc(need);
	if (sB == NULL) {
		fprintf(stderr, "%s: not enough memory for compressing "
			"with buffersize %lu (needed %lu bytes)\n",
			myName, (unsigned long)bsz, (unsigned long)need);
		exit(1);
	}
	ptrB = (p32_8 *)(&sB[bsz]);
	rdxB = (u32 *)(&ptrB[bsz]);
	/* Hack: Save memory by reusing memory
	 * This block is discarded after striping
	 * (and thus before radix, which uses ptrB) */
	inB = (byte *)ptrB;
}

/* Print the version line and the license notice to stderr, with the
 * program name filled in.  Does not return: exits with status 1. */
static inline void
bwc_license(void)
{
	fprintf(stderr,
"%s 0.99 (1 Feb 1998) (extension: " BWC_EXT ")\n"
"   Copyright (C) 1998 Willem Monsuwe\n\n"
"   This program is free software; you can redistribute it and/or modify\n"
"   it under the terms of the GNU General Public License as published by\n"
"   the Free Software Foundation; either version 2 of the License, or\n"
"   (at your option) any later version.\n\n"
"   This program is distributed in the hope that it will be useful,\n"
"   but WITHOUT ANY WARRANTY; without even the implied warranty of\n"
"   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the\n"
"   GNU General Public License for more details.\n\n"
"   You should have received a copy of the GNU General Public License\n"
"   along with this program; if not, write to the Free Software\n"
"   Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.\n"
		, myName);
	exit(1);
}

/* Print usage and exit
 * Writes the option summary of the compressor to stderr and exits
 * with status 1.  The synopsis line names every single-letter option
 * that is explained in the list below it (-t and -L were missing).
 */
static inline void
bwc_usage(void)
{
	fprintf(stderr,
"usage: %s [-dctkvL-] [-m[size]] [files]\n\n"
"   -d:       decompress.\n"
"   -c:       output to stdout.\n"
"   -t:       test BWC file integrity. (implies -d)\n"
"   -k:       keep input files.\n"
"   -v:       output compression statistics.\n"
"   -L:       display software license.\n"
"   --:       treat the rest of the line as files (for files starting with -)\n"
"   -m<size>: max compression block size. (rounded down to 64k blocks)\n"
"             You can specify the size in kilobytes or megabytes with k or m\n"
"   -m without an argument sets biggest possible blocksize.\n"
"      (the blocksize is minimized to the size of the largest file,\n"
"       so if you compress from stdin you'll need over 128Mb)\n"
"   Memory usage:\n"
"     Compression uses eight times the given block size plus a 256k table,\n"
"     Decompression uses four times the block size of the BWC file.\n"
"   Example:\n"
"     %s -vm1.8m big.file\n"
"     compresses big.file in 1.8 meg blocks (using 14.75 megabytes of memory).\n"
"   Default block size: %uk  Max block size: 16m (uses 128.25 megabytes)\n\n"
		, myName, myName, (unsigned)(DEF_BLOCK_SIZE * 64));
	exit(1);
}

/* Print usage of the decompressor personality on stderr and exit with
 * status 1.  The synopsis lists every flag that parse_args() accepts
 * in this mode (including -t and -L). */
static inline void
unbwc_usage(void)
{
	fprintf(stderr,
"usage: %s [-ctkL-] [files]\n\n"
"   -c: output to stdout.\n"
"   -t: test BWC file integrity.\n"
"   -k: keep input files.\n"
"   -L: display software license.\n"
"   --: treat the rest of the line as files (for files starting with -)\n\n"
"   Memory usage:\n"
"     Decompression uses four times the block size of the BWC file.\n"
		, myName);
	exit(1);
}

/* Parse a size indicator */
static inline u32
parse_size(char **c)
{
	char *p = *c;
	u32 i, f;

	i = 0;
	while ((*p >= '0') && (*p <= '9')) {
		i *= 10;
		i += (*p - '0');
		p++;
	}
	if ((*p == '.') || (*p == ','))
		p++;
	*c = p;
	while ((*p >= '0') && (*p <= '9'))
		p++;
	if (*p == 'k') {
		f = 1 << 10;
		i <<= 10;
	} else if (*p == 'm') {
		f = 1 << 20;
		i <<= 20;
	} else {
		f = 0;
	}
	p = *c;
	while ((*p >= '0') && (*p <= '9')) {
		if (f) {
			f /= 10;
			i += f * (*p - '0');
		}
		p++;
	}
	if ((*p == 'k') || (*p == 'm'))
		p++;
	*c = p;
	return i;
}

/* Input file bookkeeping, filled in by parse_args():
 *   ioFiles - names of the input files taken from the command line
 *   noFiles - number of names in ioFiles once parse_args() returns
 *   maxLen  - length of the longest of those names (starts out at 5) */
static char **ioFiles;
static u32 noFiles;
static u32 maxLen;

/* Give the output file the mode bits and the access and modification
 * times of the input file.
 * Every failure is reported on stderr; none of them is fatal.
 * If the input file cannot be examined, the output is left untouched. */
static inline void
copy_protection(char *in, char *out)
{
	struct stat src;
	struct utimbuf stamps;

	if (stat(in, &src) != 0) {
		fprintf(stderr, "%s: Failed to get filebits of %s: %s\n",
			myName, in, strerror(errno));
		return;
	}

	if (chmod(out, src.st_mode) != 0) {
		fprintf(stderr, "%s: Failed to set filebits of %s: %s\n",
			myName, out, strerror(errno));
	}

	stamps.actime = src.st_atime;
	stamps.modtime = src.st_mtime;
	if (utime(out, &stamps) != 0) {
		fprintf(stderr, "%s: Failed to set date of %s: %s\n",
			myName, out, strerror(errno));
	}
}

/* Tell whether `file' may be treated as a regular file.
 * A name that cannot be stat()ed counts as regular too. */
static inline bool
regular_file(char *file)
{
	struct stat info;

	return (stat(file, &info) != 0) || S_ISREG(info.st_mode);
}

/* return max(filesize(file), ms)
 *
 * If the file cannot be stat()ed, ms is returned unchanged.
 * File sizes of 1 << 30 and above are reported as 1 << 30: the size is
 * no longer cut down to its low 32 bits (which made a file of 4 GiB
 * plus a few bytes look tiny), while the result stays small enough
 * that adding 1 to it cannot wrap around. */
static inline u32
max_file_size(char *file, u32 ms)
{
	const u32 cap = (u32)1 << 30;
	struct stat st;
	u32 sz;

	if (stat(file, &st))
		return ms;
	if (st.st_size <= 0)
		return ms;
	/* Compare in the width of st_size, before narrowing to u32 */
	if (st.st_size >= 0x40000000L)
		sz = cap;
	else
		sz = (u32)st.st_size;
	return (sz > ms) ? sz : ms;
}

/* Parse the commandline arguments.
 * exits if asked for usage. */
static inline void
parse_args(int argc, char *argv[])
{
	int i;
	u32 n, minSize;
	bool opts;

	noFiles = 0;
	ioFiles = NULL;

	Flg.error = 0;
	Flg.compressing = 1;
	Flg.keep_input = 0;
	Flg.verbose = 0;
	Flg.to_stdout = 0;
	Flg.unbwz = 0;
	Flg.test = 0;
	opts = 1;
	myName = strrchr(argv[0], PATH_SC);
	if (myName) myName++; else myName = argv[0];
	if (((myName[0] == 'u') || (myName[0] == 'U')) &&
		((myName[1] == 'n') || (myName[1] == 'N'))) {
		Flg.compressing = 0;
		Flg.unbwz = 1;
	}
	bSize = DEF_BLOCK_SIZE * 64 * 1024;
	maxLen = 5;
	minSize = 1;
	n = i = 0;
	while (++i < argc) {
		if (opts && (argv[i][0] == '-')) {
			char *c = argv[i];
			if (!c[1])
				opts = 0;
			if (Flg.unbwz) while (*++c) switch(*c) {
			default:
				fprintf(stderr, "%s: Unknown flag -%c\n",
					myName, *c);
				unbwc_usage();
			case 'L':
				bwc_license();
			case 'c':
				Flg.to_stdout = 1;
				break;
			case 'k':
				Flg.keep_input = 1;
				break;
			case 't':
				Flg.test = 1;
				Flg.compressing = 0;
				break;
			case '-':
				opts = 0;
				break;
			} else while (*++c) switch(*c) {
			default:
				fprintf(stderr, "%s: Unknown flag -%c\n",
					myName, *c);
				bwc_usage();
			case 'L':
				bwc_license();
			case 'd':
				Flg.compressing = 0;
				break;
			case 'c':
				Flg.to_stdout = 1;
				break;
			case 'v':
				inBytes = outBytes = 0;
				Flg.verbose = 1;
				break;
			case 'k':
				Flg.keep_input = 1;
				break;
			case 't':
				Flg.test = 1;
				break;
			case '-':
				opts = 0;
				break;
			case 'm':
				bSize = 0;
				if ((!*++c) && (++i < argc)) {
					if ((argv[i][0] < '0') ||
						(argv[i][0] > '9')) {
						i--;
						c--;
						bSize = 0x100 << BLOCK_BITS;
						break;
					}
					c = argv[i];
				}
				bSize = parse_size(&c);
				if (!bSize)
					bSize = 0x100 << BLOCK_BITS;
				c--;
			}
		} else {
			if (n >= noFiles) {
				noFiles += 32;
				ioFiles = (char **)realloc(ioFiles, noFiles *
					sizeof(char *));
				if (!ioFiles)
					cmdline_error();
			}
			ioFiles[n] = argv[i];
			if (regular_file(ioFiles[n])) {
				if (strlen(ioFiles[n]) > maxLen)
					maxLen = strlen(ioFiles[n]);
				minSize = max_file_size(ioFiles[n], minSize);
				n++;
			} else {
				fprintf(stderr, "%s: %s is not a "
					"regular file, skipping\n",
					myName, ioFiles[n]);
			}
		}
	}
	noFiles = n;
	if (noFiles && (minSize + 1 < bSize))
		bSize = minSize + 1;
	bSize += (1 << BLOCK_BITS) - 1;
	bSize &= ~((1 << BLOCK_BITS) - 1);
	if ((bSize > (1 << (BLOCK_BITS + 8)))) {
		fprintf(stderr, "%s: -m requires a size argument less than 16m\n",
			myName);
		bwc_usage();
	}
	outFileName = (char *)malloc(maxLen + 1 +
		(Flg.compressing ? strlen(BWC_EXT) : 0));
}

/* Compress file
 *
 * Compresses `ifile' into a file of the same name with BWC_EXT appended.
 * An already existing output file is never overwritten; the input is
 * skipped with a message instead.  After a successful run the output
 * gets the mode bits and dates of the input, and the input is removed
 * unless -k was given.
 * Errors are reported through input_error() / output_error(). */
static void
compress_file(char *ifile)
{
	char *ofile = outFileName;
	FILE *in, *out;
	int failed;

	inFileHandle = NULL;
	outFileHandle = NULL;
	inFileName = ifile;
	strcpy(ofile, ifile);
	strcat(ofile, BWC_EXT);

	out = fopen(ofile, "r");
	if (out) {
		fclose(out);
		fprintf(stderr, "%s: output file %s exists, skipping.\n",
			myName, ofile);
		return;
	}
	in = fopen(ifile, "rb");
	if (!in) {
		input_error();
		return;
	}
	out = fopen(ofile, "wb");
	if (!out) {
		/* Report first (errno is still the one from fopen), then
		 * release the input; it is not registered in inFileHandle
		 * yet, so nobody else would close it. */
		output_error();
		fclose(in);
		return;
	}
	inFileHandle = in;
	outFileHandle = out;
	if (Flg.verbose)
		fprintf(stderr, "%-*s: ", (int)maxLen, inFileName);
	failed = compress(in, out);
	/* Both streams are finished with at this point.
	 * NOTE(review): this assumes compress() closes them on success and
	 * that its error paths go through remove_outfile() - confirm.
	 * Forgetting them keeps a signal arriving from here on from closing
	 * them again and from deleting an output file that is complete. */
	inFileHandle = NULL;
	outFileHandle = NULL;
	if (failed)
		return;
	copy_protection(ifile, ofile);
	if ((!Flg.keep_input) && remove(ifile))
		fprintf(stderr, "%s: failed to remove input file %s: %s\n",
			myName, ifile, strerror(errno));
}

/* Decompress file
 *
 * Decompresses `ifile' into a file of the same name with the trailing
 * BWC_EXT (compared case-insensitively) stripped off.  In test mode no
 * output file is involved at all: decompress() is handed a NULL output
 * stream and the input file is left alone.
 * An already existing output file is never overwritten.  After a
 * successful run the output gets the mode bits and dates of the input,
 * and the input is removed unless -k was given.
 * Errors are reported through input_error() / output_error(). */
static void
decompress_file(char *ifile)
{
	char *ofile = outFileName;
	FILE *in, *out = NULL;
	int t, failed;

	inFileHandle = NULL;
	outFileHandle = NULL;
	inFileName = ifile;
	if (!Flg.test) {
		t = (int)strlen(ifile) - (int)strlen(BWC_EXT);
		/* A name shorter than the extension cannot end in it;
		 * without this check ifile + t would point in front of
		 * the string. */
		if ((t < 0) || strcasecmp(ifile + t, BWC_EXT)) {
			fprintf(stderr, "%s: input file %s doesn't end in '%s'\n",
				myName, ifile, BWC_EXT);
			return;
		}
		memcpy(ofile, ifile, t);
		ofile[t] = 0;

		out = fopen(ofile, "r");
		if (out) {
			fclose(out);
			fprintf(stderr, "%s: output file %s exists, skipping.\n",
				myName, ofile);
			return;
		}
	}
	in = fopen(ifile, "rb");
	if (!in) {
		input_error();
		return;
	}
	if (!Flg.test) {
		out = fopen(ofile, "wb");
		if (!out) {
			/* Report first (errno is still the one from
			 * fopen), then release the input; it is not
			 * registered in inFileHandle yet. */
			output_error();
			fclose(in);
			return;
		}
		outFileHandle = out;
	}
	inFileHandle = in;
	failed = decompress(in, out);
	/* Both streams are finished with at this point.
	 * NOTE(review): this assumes decompress() closes them on success
	 * and that its error paths go through remove_outfile() - confirm.
	 * Forgetting them keeps a signal arriving from here on from closing
	 * them again and from deleting an output file that is complete. */
	inFileHandle = NULL;
	outFileHandle = NULL;
	if (failed)
		return;
	if (!Flg.test) {
		copy_protection(ifile, ofile);
		if ((!Flg.keep_input) && remove(ifile))
			fprintf(stderr, "%s: failed to remove input file %s: %s\n",
				myName, ifile, strerror(errno));
	}
}

/* Sanity check on the widths of the basic integer types.
 * All tests are on compile-time constants, so this function should
 * either be compiled away, or call exit(1).
 * With DEBUG defined the offending types are named on stderr first.
 * Hmm.. compiler doesn't throw away unused string constants.. */
static inline void
check_stuff(void)
{
	if ((sizeof(u32) != 4) || (sizeof(u16) != 2) ||
		(sizeof(byte) != 1) || (sizeof(u32_8) != 4)) {
#ifdef DEBUG
		fputs("WARNING: bwc was not compiled correctly:\n", stderr);
		if (sizeof(u32) != 4)
			fputs("sizeof(u32) != 4\n", stderr);
		if (sizeof(u16) != 2)
			fputs("sizeof(u16) != 2\n", stderr);
		if (sizeof(byte) != 1)
			fputs("sizeof(byte) != 1\n", stderr);
		if (sizeof(u32_8) != 4)
			fputs("sizeof(u32_8) != 4\n", stderr);
		fputs("Please typedef these correctly and recompile.\n", stderr);
#endif
		exit(1);
	}
}

static void catch_sig(int);

/* Program entry point.
 *
 * With file arguments and without -c, every file is compressed or
 * decompressed to a file of its own; with -v and more than one file a
 * total is printed after compressing.
 * Otherwise data goes to stdout, coming from the single named file
 * (-c) or from stdin.
 *
 * Always leaves through exit(); the status is Flg.error, which
 * remove_outfile() sets when something went wrong.  This holds for
 * the per-file mode as well, so a failed file is visible to the caller
 * of the program. */
int
main(int argc, char *argv[])
{
	u32 i;
	FILE *in = stdin;

	check_stuff();
	signal (SIGINT, catch_sig);
	signal (SIGTERM, catch_sig);
	signal (SIGHUP, catch_sig);
	signal (SIGSEGV, catch_sig);
	signal (SIGBUS, catch_sig);

	parse_args(argc, argv);
	if (Flg.compressing)
		compress_alloc();
	if (noFiles && !Flg.to_stdout) {
		if (Flg.compressing) {
			for (i = 0; i < noFiles; i++) {
				compress_file(ioFiles[i]);
			}
			if (Flg.verbose && (noFiles > 1)) {
				fprintf(stderr, "\n%-*s: ", (int)maxLen,
					"Total");
				print_stats(inBytes, outBytes);
			}
		} else {
			for (i = 0; i < noFiles; i++) {
				decompress_file(ioFiles[i]);
			}
		}
		exit (Flg.error);
	}
	/* Streaming to stdout: the name buffer is no longer needed,
	 * the names below are only used in messages. */
	free(outFileName);
	inFileName = "(stdin)";
	outFileName = "(stdout)";
	inFileHandle = NULL;
	outFileHandle = NULL;
	if (Flg.to_stdout) {
		if (noFiles != 1) {
			fprintf(stderr, "%s: -c requires exactly one file\n",
				myName);
			exit(1);
		}
		inFileName = ioFiles[0];
		in = fopen(inFileName, "rb");
		if (!in) {
			input_error();
			exit(1);
		}
	}
	if (Flg.compressing)
		compress(in, stdout);
	else
		decompress(in, stdout);
	exit(Flg.error);
}

/* Error handling: remove output file
 *
 * Closes the registered input and output streams, if any.  If an output
 * stream was registered, the file named outFileName is removed as well,
 * with a message on stderr.  Always sets Flg.error.
 *
 * The handles are forgotten before they are closed, so calling this
 * again (a second error, or a signal after an error) neither closes a
 * stream twice nor removes the output a second time. */
static inline void
remove_outfile(void)
{
	FILE *in = inFileHandle;
	FILE *out = outFileHandle;

	inFileHandle = NULL;
	outFileHandle = NULL;
	if (in)
		fclose(in);
	if (out) {
		fclose(out);
		fprintf(stderr, "%s: removing output file %s",
			myName, outFileName);
		if (remove(outFileName))
			fprintf(stderr, "...FAILED! (%s)", strerror(errno));
		putc('\n', stderr);
	}
	Flg.error = 1;
}

/* Signal handler: say which signal came in, clean up through
 * remove_outfile() and exit with status 1.
 * Anything other than an interrupt, hangup or termination request
 * also gets a request to report the bug. */
static void
catch_sig(int n)
{
	const char *what;
	const char *plea = "(This is a bug. Please report it to me at: "
			"willem@stack.nl)\n";

	if (n == SIGINT) {
		what = "Ctrl-C";
		plea = "";
	} else if (n == SIGHUP) {
		what = "hangup signal";
		plea = "";
	} else if (n == SIGTERM) {
		what = "termination signal";
		plea = "";
	} else if (n == SIGSEGV) {
		what = "segmentation fault";
	} else if (n == SIGBUS) {
		what = "bus error";
	} else {
		what = "Unknown signal";
	}
	fprintf(stderr, "\n%s: Caught %s, quitting.\n%s", myName, what, plea);
	remove_outfile();
	exit(1);
}

/* Report a failure on the current input file, including the text
 * for the current errno, then clean up through remove_outfile(). */
static void
input_error(void)
{
	const char *reason = strerror(errno);

	fprintf(stderr, "%s: I/O error reading %s: %s\n",
		myName, inFileName, reason);
	remove_outfile();
}

/* Report a failure on the current output file, including the text
 * for the current errno, then clean up through remove_outfile(). */
static void
output_error(void)
{
	const char *reason = strerror(errno);

	fprintf(stderr, "%s: I/O error writing %s: %s\n",
		myName, outFileName, reason);
	remove_outfile();
}

/* Report that the current input file is a corrupt BWC file,
 * then clean up through remove_outfile(). */
static void
block_error(void)
{
	const char *file = inFileName;

	fprintf(stderr, "%s: BWC file %s is corrupt\n", myName, file);
	remove_outfile();
}

/* Report that the current input file is not a BWC file,
 * then clean up through remove_outfile(). */
static void
magic_error(void)
{
	const char *file = inFileName;

	fprintf(stderr, "%s: %s is not a BWC file.\n", myName, file);
	remove_outfile();
}

/* Refuse compressed I/O on a terminal: report `name' as the
 * offending stream, then clean up through remove_outfile(). */
static void
terminal_error(char *name)
{
	const char *prog = myName;

	fprintf(stderr, "%s: no compressed I/O on a terminal: %s\n",
		prog, name);
	remove_outfile();
}

/* Out of memory while parsing the command line:
 * say so on stderr and exit with status 1. */
static void
cmdline_error(void)
{
	const char *prog = myName;

	fprintf(stderr,
		"%s: Couldn't allocate memory for commandline parsing.\n",
		prog);
	exit(1);
}

