////////////////////////////////////////////////////////////////////////
//	UDA Core 0.301 (2006.12.19) Author:dwing
////////////////////////////////////////////////////////////////////////
#include <string.h>
#include <stdlib.h>
#include <stdio.h>
typedef unsigned char  U8;
typedef unsigned short U16;
typedef unsigned int   U32;
////////////////////////////////////////////////////////////////////////
// 0x100000/0x200000:	96/179 MB
const  U32 MEM	=0x200000;	// 0x10000 ~ 0x2000000
static U32 type =0; // 0=Default 1=EXE
static U32 y	=0; // Last bit,0 or 1,set by encoder
static U32 c0	=1; // Last 0-7 bits of the partial byte with lead 1 bit
static U32 c1=0,c2=0,c3=0;//Last 1,2,3 byte.
static U32 c4	=0; // Last 4 whole bytes,packed. Last byte is bits 0-7.
static U32 bpos =0; // bits in c0 (0 to 7)
static U32 pos	=0; // Number of input bytes in buf (not wrapped)
////////////////////////////////////////////////////////////////////////
template<class T> class Array
{
	T		*data;
	U8		*p;
	const U32 SIZE;
public:
	explicit Array(U32 i,U32 align=0);
			~Array()				{free(p);}
	T&		 operator[](U32 i)		{return data[i];}
	const T& operator[](U32 i)const {return data[i];}
	U32 	 size() const			{return SIZE;}
};
//static U32 memsize=0;
template<class T> Array<T>::Array(U32 i,U32 align):SIZE(i)
{
//	memsize+=align+SIZE*sizeof(T);printf("%u ",memsize);
	if(!(p=(U8*)calloc(align+SIZE*sizeof(T),1)))
	{
		printf("\nERROR: Not enough memory\n");
		exit(0);
	}
	data=(T*)(align?(p+align)-((U32)p&(align-1)):p);
}
////////////////////////////////////////////////////////////////////////
class Buf
{
	Array<U8> b;
public:
		Buf(U32 i):b(i) 		{}
	U32 size() const			{return b.size();}
	U8& operator[](U32 i)		{return b[i&(size()-1)];}
	U32 operator()(U32 i) const {return (U32)b[(pos-i)&(size()-1)];}
}buf(MEM*8);
////////////////////////////////////////////////////////////////////////
class Ilog	// ilog(x) = round(log2(x)*16),0<=x<65536
{
	Array<U8> t;
public:
	Ilog():t(65536)
	{
		U32 x=14155776;
		for(U32 i=2;i<65536;++i)
			t[i]=(x+=774541002/(i*2-1))>>24;	// 2^29/ln 2
	}
	U32 operator()(U16 x) const {return t[x];}
}ilog;
////////////////////////////////////////////////////////////////////////
U32 squash(int d) // return p = 1/(1+exp(-d)),d scaled 8b,p scaled 12b
{
	static const U32 t[33]=
	{	   1,	2,	 3,   6,  10,  16,	27,  45,
		  73, 120, 194, 310, 488, 747,1101,1546,
		2047,2549,2994,3348,3607,3785,3901,3975,
		4022,4050,4068,4079,4085,4089,4092,4093,4094
	};
	if(d> 2047) return 4095;
	if(d<-2047) return 0;
	const U32 w=d&127;
	d=(d>>7)+16;
	return (t[d]*(128-w)+t[(d+1)]*w+64) >> 7;
}
////////////////////////////////////////////////////////////////////////
// Inverse of squash. d = ln(p/(1-p)),d scaled by 8 bits,p by 12 bits.
class Stretch
{
	Array<int> t;
public:
	Stretch():t(4096)
	{
		U32 j=0;
		for(int x=-2047;x<=2047;++x)	// invert squash()
			for(U32 i=squash(x);j<=i;++j) t[j]=x;
		t[4095]=2047;
	}
	int operator()(U32 p) const {return t[p];}
}stretch;
////////////////////////////////////////////////////////////////////////
// Mixer m(N,M,S=1,w=0) combines models using M neural networks with
//	 N inputs each,of which up to S may be selected.  If S > 1 then
//	 the outputs of these neural networks are combined using another
//	 neural network (with parameters S,1,1).	If S = 1 then the
//	 output is direct.	The weights are initially w (+-32K).
//	 It is used as follows:
// m.update() trains the network where the expected output is the
//	 last bit (in the global variable y).
// m.add(stretch(p)) inputs prediction from one of N models.  The
//	 prediction should be positive to predict a 1 bit,negative for 0,
//	 nominally +-256 to +-2K.  The maximum allowed value is +-32K but
//	 using such large values may cause overflow ifN is large.
// m.set(cxt,range) selects cxt as one of 'range' neural networks to
//	 use.  0 <= cxt < range.  Should be called up to S times such
//	 that the total of the ranges is <= M.
// m.p() returns the output prediction that the next bit is 1 as a
//	 12 bit number (0 to 4095).
// dot_product returns dot product t*w of n elements.  n is rounded
// up to a multiple of 8.  Result is scaled down by 8 bits.
//int dot_product(short *t,short *w,U32 n)
//{
//	int sum=0;
//	n=(n+7)&-8;
//	for(U32 i=0;i<n;i+=2)
//		sum+=(t[i]*w[i]+t[i+1]*w[i+1]) >> 8;
//	return sum;
//}
__declspec(naked) int __stdcall dot_product(short *t,short *w,U32 n)
{__asm{
		mov 	ecx,[esp+12]	// n
		dec 	ecx 			// n rounding up
		mov 	edx,[esp+ 8]	// w
		and 	ecx,-8
		js		done_
		mov 	eax,[esp+ 4]	// t
		pxor	mm0,mm0 		// sum = 0
loop_:	movq	mm1,[eax+ecx*2	]// put halves of vector product in mm0
		pmaddwd mm1,[edx+ecx*2	]
		movq	mm2,[eax+ecx*2+8]
		pmaddwd mm2,[edx+ecx*2+8]
		psrad	mm1,8
		psrad	mm2,8
		paddd	mm0,mm1
		paddd	mm0,mm2
		sub 	ecx,8			// each loop sums 4 products
		jns 	loop_
		movq	mm1,mm0 		// add 2 halves of mm0 and return in eax
		psrlq	mm1,32
		paddd	mm0,mm1
		movd	eax,mm0
		emms
done_:	ret 	4*3
}}
// Train neural network weights w[n] given inputs t[n] and err.
// w[i] += t[i]*err,i=0..n-1.	t,w,err are signed 16 bits (+- 32K).
// err is scaled 16 bits (representing +- 1/2).w[i] is clamped to +- 32K
// and rounded.  n is rounded up to a multiple of 8.
//void train(short *t,short *w,U32 n,int err)
//{
//	n=(n+7)&-8;
//	for(U32 i=0;i<n;++i)
//	{
//		int wt=w[i]+(((t[i]*err*2>>16)+1)>>1);
//		if(wt<-32768) wt=-32768;
//		if(wt> 32767) wt= 32767;
//		w[i]=wt;
//	}
//}
__declspec(naked) void __stdcall train(short *t,short *w,U32 n,U32 err)
{__asm{ mov 	eax,[esp+16]		// err
		pcmpeqd mm1,mm1 			// 4 copies of 1 in mm1
		movd	mm0,eax
		psrlw	mm1,15
	  punpcklwd mm0,mm0 			// put 2 copies of err in mm0
		mov 	ecx,[esp+12]		// n
	  punpcklwd mm0,mm0 			// put 4 copies of err in mm0
		mov 	eax,[esp+ 4]		// t
		dec 	ecx 				// n/8 rounding up
		mov 	edx,[esp+ 8]		// w
		and 	ecx,-8
		js		done_
loop_:	movq	mm2,[edx+ecx*2	]	// w[i]
		movq	mm3,[eax+ecx*2	]	// t[i]
		movq	mm4,[edx+ecx*2+8]	// w[i]
		movq	mm5,[eax+ecx*2+8]	// t[i]
		paddsw	mm3,mm3
		paddsw	mm5,mm5
		pmulhw	mm3,mm0
		pmulhw	mm5,mm0
		paddsw	mm3,mm1
		paddsw	mm5,mm1
		psraw	mm3,1
		psraw	mm5,1
		paddsw	mm2,mm3
		paddsw	mm4,mm5
		movq	[edx+ecx*2	],mm2
		movq	[edx+ecx*2+8],mm4
		sub 	ecx,8				// each iteration adjusts 8 weights
		jns 	loop_
		emms
done_:	ret 	4*4
}}
////////////////////////////////////////////////////////////////////////
class Mixer
{
	const U32 N,S;		// max inputs,max context sets
	Array<short> tx;	// N inputs from add()
	Array<short> wx;	// N*M weights
	Array<U32> cxt; 	// S contexts
	U32 ncxt;			// number of contexts (0 to S)
	U32 base;			// offset of next context
	U32 nx; 			// Number of inputs in tx,0 to N
	Array<U32> pr;		// last result (scaled 12 bits)
	Mixer *mp;			// points to a Mixer to combine results
public:
	Mixer(U32 n,U32 k,U32 s=1,int w=0):N((n+7)&-8),S(s),tx(N,16),
			wx(N*k,16),cxt(S),ncxt(0),base(0),nx(0),pr(S),mp(0)
	{
		U32 i;
		for(i=0;i<S  ;++i) pr[i]=2048;
		for(i=0;i<N*k;++i) wx[i]=w;
		if(S>1) mp=new Mixer(S,1,1,0x7fff);
	}
	~Mixer() {if(S>1) delete mp;}
	void add(int x){tx[nx++]=x;}// Input x (call up to N times)
	void set(U32 cx,U32 range)
	{				// Set a context (call S times,sum of ranges <= M)
		cxt[ncxt++]=base+cx;
		base+=range;
	}
	// Adjust weights to minimize coding cost of last prediction
	void update()
	{
		for(U32 i=ncxt;i--;)
			train(&tx[0],&wx[cxt[i]*N],nx,((y<<12)-pr[i])*7);
		nx=base=ncxt=0;
	}
	U32 p() 					// predict next bit
	{
		while(nx&7) tx[nx++]=0; // pad
		if(mp)	// combine outputs
		{
			mp->update();
			for(U32 i=ncxt;i--;)
			{
				pr[i]=squash(dot_product(&tx[0],&wx[cxt[i]*N],nx)>>5);
				mp->add(stretch(pr[i]));
			}
			mp->set(0,1);
			return mp->p();
		}
		return pr[0]=squash(dot_product(&tx[0],&wx[0],nx)>>8);
	}
}m(512,(16+256*3+512*13),4,128);
////////////////////////////////////////////////////////////////////////
// APM maps a probability and a context into a new probability
// that bit y will next be 1.  After each guess it updates
// its state to improve future guesses.  Methods:
// APM a(N) creates with N contexts,uses 66*N bytes memory.
// a.p(pr,cx,rate=8) returned adjusted probability in context cx (0 to
//	 N-1). rate determines the learning rate (smaller=faster,default 8).
//	 Probabilities are scaled 16 bits (0-65535).
class APM
{
	U32 index;		// last p,context
	const U32 N;	// number of contexts
	Array<U16> t;	// [N][33]: p,context -> p
public:
	APM(U32 n):index(0),N(n),t(n*33)	// maps p,cxt -> p initially
	{
		for(U32 i=0;i<N;++i)
			for(U32 j=0;j<33;++j)
				t[i*33+j] = i==0 ? squash((j-16)*128)*16 : t[j];
	}
	U32 p(U32 pr=2048,U32 cxt=0,U32 rate=8)
	{
		pr=stretch(pr);
		const U32 g=(y<<16)+(y<<rate)-y-y;
		t[index  ] += (g-t[index  ])>>rate;
		t[index+1] += (g-t[index+1])>>rate;
		const U32 w=pr&127;  // interpolation weight (33 points)
		index=((pr+2048)>>7)+cxt*33;
		return (t[index]*(128-w)+t[index+1]*w) >> 11;
	}
};
////////////////////////////////////////////////////////////////////////
#define nex(state,sel) state_table[state][sel]
static const U8 state_table[256][4]={
{  1,  2, 0, 0},{  3,  5, 1, 0},{  4,  6, 0, 1},{  7, 10, 2, 0},//0-3
{  8, 12, 1, 1},{  9, 13, 1, 1},{ 11, 14, 0, 2},{ 15, 19, 3, 0},//4-7
{ 16, 23, 2, 1},{ 17, 24, 2, 1},{ 18, 25, 2, 1},{ 20, 27, 1, 2},//8-11
{ 21, 28, 1, 2},{ 22, 29, 1, 2},{ 26, 30, 0, 3},{ 31, 33, 4, 0},//12-15
{ 32, 35, 3, 1},{ 32, 35, 3, 1},{ 32, 35, 3, 1},{ 32, 35, 3, 1},//16-19
{ 34, 37, 2, 2},{ 34, 37, 2, 2},{ 34, 37, 2, 2},{ 34, 37, 2, 2},//20-23
{ 34, 37, 2, 2},{ 34, 37, 2, 2},{ 36, 39, 1, 3},{ 36, 39, 1, 3},//24-27
{ 36, 39, 1, 3},{ 36, 39, 1, 3},{ 38, 40, 0, 4},{ 41, 43, 5, 0},//28-31
{ 42, 45, 4, 1},{ 42, 45, 4, 1},{ 44, 47, 3, 2},{ 44, 47, 3, 2},//32-35
{ 46, 49, 2, 3},{ 46, 49, 2, 3},{ 48, 51, 1, 4},{ 48, 51, 1, 4},//36-39
{ 50, 52, 0, 5},{ 53, 43, 6, 0},{ 54, 57, 5, 1},{ 54, 57, 5, 1},//40-43
{ 56, 59, 4, 2},{ 56, 59, 4, 2},{ 58, 61, 3, 3},{ 58, 61, 3, 3},//44-47
{ 60, 63, 2, 4},{ 60, 63, 2, 4},{ 62, 65, 1, 5},{ 62, 65, 1, 5},//48-51
{ 50, 66, 0, 6},{ 67, 55, 7, 0},{ 68, 57, 6, 1},{ 68, 57, 6, 1},//52-55
{ 70, 73, 5, 2},{ 70, 73, 5, 2},{ 72, 75, 4, 3},{ 72, 75, 4, 3},//56-59
{ 74, 77, 3, 4},{ 74, 77, 3, 4},{ 76, 79, 2, 5},{ 76, 79, 2, 5},//60-63
{ 62, 81, 1, 6},{ 62, 81, 1, 6},{ 64, 82, 0, 7},{ 83, 69, 8, 0},//64-67
{ 84, 71, 7, 1},{ 84, 71, 7, 1},{ 86, 73, 6, 2},{ 86, 73, 6, 2},//68-71
{ 44, 59, 5, 3},{ 44, 59, 5, 3},{ 58, 61, 4, 4},{ 58, 61, 4, 4},//72-75
{ 60, 49, 3, 5},{ 60, 49, 3, 5},{ 76, 89, 2, 6},{ 76, 89, 2, 6},//76-79
{ 78, 91, 1, 7},{ 78, 91, 1, 7},{ 80, 92, 0, 8},{ 93, 69, 9, 0},//80-83
{ 94, 87, 8, 1},{ 94, 87, 8, 1},{ 96, 45, 7, 2},{ 96, 45, 7, 2},//84-87
{ 48, 99, 2, 7},{ 48, 99, 2, 7},{ 88,101, 1, 8},{ 88,101, 1, 8},//88-91
{ 80,102, 0, 9},{103, 69,10, 0},{104, 87, 9, 1},{104, 87, 9, 1},//92-95
{106, 57, 8, 2},{106, 57, 8, 2},{ 62,109, 2, 8},{ 62,109, 2, 8},//96-99
{ 88,111, 1, 9},{ 88,111, 1, 9},{ 80,112, 0,10},{113, 85,11, 0},//100-03
{114, 87,10, 1},{114, 87,10, 1},{116, 57, 9, 2},{116, 57, 9, 2},//104-07
{ 62,119, 2, 9},{ 62,119, 2, 9},{ 88,121, 1,10},{ 88,121, 1,10},//108-11
{ 90,122, 0,11},{123, 85,12, 0},{124, 97,11, 1},{124, 97,11, 1},//112-15
{126, 57,10, 2},{126, 57,10, 2},{ 62,129, 2,10},{ 62,129, 2,10},//116-19
{ 98,131, 1,11},{ 98,131, 1,11},{ 90,132, 0,12},{133, 85,13, 0},//120-23
{134, 97,12, 1},{134, 97,12, 1},{136, 57,11, 2},{136, 57,11, 2},//124-27
{ 62,139, 2,11},{ 62,139, 2,11},{ 98,141, 1,12},{ 98,141, 1,12},//128-31
{ 90,142, 0,13},{143, 95,14, 0},{144, 97,13, 1},{144, 97,13, 1},//132-35
{ 68, 57,12, 2},{ 68, 57,12, 2},{ 62, 81, 2,12},{ 62, 81, 2,12},//136-39
{ 98,147, 1,13},{ 98,147, 1,13},{100,148, 0,14},{149, 95,15, 0},//140-43
{150,107,14, 1},{150,107,14, 1},{108,151, 1,14},{108,151, 1,14},//144-47
{100,152, 0,15},{153, 95,16, 0},{154,107,15, 1},{108,155, 1,15},//148-51
{100,156, 0,16},{157, 95,17, 0},{158,107,16, 1},{108,159, 1,16},//152-55
{100,160, 0,17},{161,105,18, 0},{162,107,17, 1},{108,163, 1,17},//156-59
{110,164, 0,18},{165,105,19, 0},{166,117,18, 1},{118,167, 1,18},//160-63
{110,168, 0,19},{169,105,20, 0},{170,117,19, 1},{118,171, 1,19},//164-67
{110,172, 0,20},{173,105,21, 0},{174,117,20, 1},{118,175, 1,20},//168-71
{110,176, 0,21},{177,105,22, 0},{178,117,21, 1},{118,179, 1,21},//172-75
{110,180, 0,22},{181,115,23, 0},{182,117,22, 1},{118,183, 1,22},//176-79
{120,184, 0,23},{185,115,24, 0},{186,127,23, 1},{128,187, 1,23},//180-83
{120,188, 0,24},{189,115,25, 0},{190,127,24, 1},{128,191, 1,24},//184-87
{120,192, 0,25},{193,115,26, 0},{194,127,25, 1},{128,195, 1,25},//188-91
{120,196, 0,26},{197,115,27, 0},{198,127,26, 1},{128,199, 1,26},//192-95
{120,200, 0,27},{201,115,28, 0},{202,127,27, 1},{128,203, 1,27},//196-99
{120,204, 0,28},{205,115,29, 0},{206,127,28, 1},{128,207, 1,28},//200-03
{120,208, 0,29},{209,125,30, 0},{210,127,29, 1},{128,211, 1,29},//204-07
{130,212, 0,30},{213,125,31, 0},{214,137,30, 1},{138,215, 1,30},//208-11
{130,216, 0,31},{217,125,32, 0},{218,137,31, 1},{138,219, 1,31},//212-15
{130,220, 0,32},{221,125,33, 0},{222,137,32, 1},{138,223, 1,32},//216-19
{130,224, 0,33},{225,125,34, 0},{226,137,33, 1},{138,227, 1,33},//220-23
{130,228, 0,34},{229,125,35, 0},{230,137,34, 1},{138,231, 1,34},//224-27
{130,232, 0,35},{233,125,36, 0},{234,137,35, 1},{138,235, 1,35},//228-31
{130,236, 0,36},{237,125,37, 0},{238,137,36, 1},{138,239, 1,36},//232-35
{130,240, 0,37},{241,125,38, 0},{242,137,37, 1},{138,243, 1,37},//236-39
{130,244, 0,38},{245,135,39, 0},{246,137,38, 1},{138,247, 1,38},//240-43
{140,248, 0,39},{249,135,40, 0},{250, 69,39, 1},{ 80,251, 1,39},//244-47
{140,252, 0,40},{249,135,41, 0},{250, 69,40, 1},{ 80,251, 1,40},//248-51
{140,252, 0,41}}; // 252,253-255 are reserved
////////////////////////////////////////////////////////////////////////
// A StateMap maps a nonstationary counter state to a probability.
// After each mapping,the mapping is adjusted to improve future
// predictions.  Methods:
// sm.p(cx) converts state cx (0-255) to a probability (0-4095).
// Counter state -> probability * 256
class StateMap
{
	U32 cxt;		// context
	Array<U16> t;	// 256 states -> probability * 64K
public:
	StateMap():cxt(0),t(256)
	{
		for(U32 i=0;i<256;++i)
		{
			U32 n0=nex(i,2);
			U32 n1=nex(i,3);
			if(!n0) n1*=256;	//128
			if(!n1) n0*=256;	//128
			t[i]=65536*(n1+1)/(n0+n1+2);
		}
	}
	U32 p(U32 cx)
	{
		t[cxt]+=((y<<16)-t[cxt]+128)>>8;
		return t[cxt=cx]>>4;
	}
};
////////////////////////////////////////////////////////////////////////
inline U32 hash(U32 a,U32 b,U32 c=~0,U32 d=~0,U32 e=~0) // ~0
{
	U32 h=a*200002979u+b*30005491u+c*50004239u+d*70004807u+e*110002499u;
	return h;//^h>>9^a>>2^b>>3^c>>4^d>>5^e>>6;
}
////////////////////////////////////////////////////////////////////////
// A ContextMap maps contexts to a bit histories and makes predictions
// to a Mixer.	Methods common to all classes:
// ContextMap cm(M,C); creates using about M bytes of memory (a power
//	 of 2) for C contexts.
// cm.set(cx);	sets the next context to cx,called up to C times
//	 cx is an arbitrary 32 bit value that identifies the context.
//	 It should be called before predicting the first bit of each byte.
// cm.mix(m) updates Mixer m with the next prediction.	Returns 1
//	 ifcontext cx is found,else 0. Then it extends all the contexts with
//	 global bit y.	It should be called for every bit:
//	   if(bpos==0)
//		 for(int i=0; i<C; ++i) cm.set(cxt[i]);
//	   cm.mix(m);
// The different types are as follows:
// - SmallStationaryContextMap.  0 <= cx < M/512.
//	   The state is a 16-bit probability that is adjusted after each
//	   prediction.	C=1.
// - ContextMap.  For large contexts,C >= 1. Context need not be hashed.
// Predict to mixer m from bit history state s,using sm to map s to
// a probability.
inline U32 mix2(U32 s,StateMap &sm)
{
	U32 p1=sm.p(s);
	const U32 n0=-!nex(s,2);
	const U32 n1=-!nex(s,3);
	const int st=stretch(p1)>>2;
	m.add(st);
	p1>>=4;
	const U32 p0=255-p1;
	m.add((p1-p0));
	m.add((n1-n0)*st);
	m.add((p1&n0)-(p0&n1));
	m.add((p1&n1)-(p0&n0));
	return s>0;
}
////////////////////////////////////////////////////////////////////////
// Context is looked up directly.  m=size is power of 2 in bytes.
// Context should be < m/512.  High bits are discarded.
class SmallStationaryContextMap
{
	Array<U16> t;
	U32 cxt;
	U16 *cp;
public:
	SmallStationaryContextMap(U32 n):t(n/2),cxt(0)
	{
		for(U32 i=0;i<t.size();++i) t[i]=32768;
		cp=&t[0];
	}
	void set(U32 cx)
	{
		cxt=(cx<<8)&(t.size()-256);
	}
	void mix(U32 rate=7)
	{
		*cp += ((y<<16)-*cp+(1<<(rate-1))) >> rate;
		cp=&t[cxt+c0];
		m.add(stretch(*cp>>4));
	}
};
////////////////////////////////////////////////////////////////////////
// Context map for large contexts.Most modeling use this type of context
// map.It includes a built in RunContextMap to predict last byte seen
// in the same context,and also bit-level contexts that map to a bit
// history state.
// Bit histories are stored in a hash table. The table is organized into
// 64-byte buckets alinged on cache page boundaries.Each bucket contains
// a hash chain of 7 elements,plus a 2 element queue(packed into 1 byte)
// of the last 2 elements accessed for LRU replacement. Each element has
// a 2 byte checksum for detecting collisions,and an array of 7b history
// states indexed by last 0~2 bits of context.The buckets are indexed
// by a context ending after 0,2,or 5 bits of current byte.Thus,each
// byte modeled results in 3 main memory accesses per context,with all
// other accesses to cache.
// On bits 0,2 and 5,the context is updated and a new bucket is selected
// The most recently accessed element is tried first,by comparing the
// 16 bit checksum,then the 7 elements are searched linearly.If no match
// is found,then the element with the lowest priority among 5 elements
// not in the LRU queue is replaced.  After a replacement,the queue is
// emptied (so that consecutive misses favor a LFU replacement policy).
// In all cases,the found/replaced element is put in front of the queue.
// The priority is the state number of the first element (the one with 0
// additional bits of context).The states are sorted by increasing n0+n1
// (number of bits seen),implementing a LFU replacement policy.
// When the context ends on a byte boundary (bit 0),only 3 of the 7 bit
// history states are used.  The remaining 4 bytes implement a run model
// as follows: <count:7,d:1> <b1> <b2> <b3> where <b1> is the last byte
// seen,possibly repeated,and <b2> and <b3> are the two bytes seen
// before the first <b1>.  <count:7,d:1> is a 7 bit count and a 1 bit
// flag.  If d=0 then <count> = 1..127 is the number of repeats of <b1>
// and no other bytes have been seen,and <b2><b3> are not used.
// If <d> = 1 then the history is <b3>,<b2>,and <count> - 2 repeats
// of <b1>.  In this case,<b3> is valid only if<count> >= 3 and
// <b2> is valid only if<count> >= 2.
// As an optimization, last two hash elements of each byte(representing
// contexts with 2-7 bits) are not updated until a context is seen for
// a second time.  This is indicated by <count,d> = <1,0>.After update,
// <count,d> is updated to <2,0> or <2,1>.
class ContextMap
{
	class E 		// hash element,64 bytes
	{
		U16 chk[7]; // byte context checksums
		U8 last;	// last 2 accesses (0-6) in low,high nibble
	public:
		U8 bh[7][7];// byte context,3-bit context -> bit history state
		// bh[][0] = 1st bit,bh[][1,2] = 2nd bit,bh[][3..6] = 3rd bit
		// bh[][0] is also a replacement priority,0 = empty
		U8* get(U16 chk);  // Find element (0-6) matching checksum.
		// If not found,insert or replace lowest priority (not last).
	};
	Array<E> t; 	// bit histories for bits 0-1,2-4,5-7
	// For 0-1,also contains a run count in bh[][4] and value in bh[][5]
	// and pending update count in bh[7]
	Array<U8*> cp;	// C pointers to current bit history
	Array<U8*> cp0; // First element of 7 element array containing cp[i]
	Array<U32> cxt; // C whole byte contexts (hashes)
	Array<U8*> runp;// C [0..3] = count,value,unused,unused
	StateMap *sm;	// C maps of state -> p
	U32 cn; 		// Next context to set by set()
public:
	ContextMap(U32 n,U32 c=1);	//m = memory in bytes,a power of 2,C = c
	~ContextMap()		{delete []sm;}
	void set(U32 cx);	// set next whole byte context
	U32 mix();
};
ContextMap::	// Construct using m bytes of memory for c contexts
ContextMap(U32 n,U32 c):t(n>>6,64),cp(c),cp0(c),cxt(c),runp(c),cn(0)
{
	sm=new StateMap[c];
	for(U32 i=0;i<c;++i)
	{
		cp0 [i]=cp[i]=&t[0].bh[0][0];
		runp[i]=cp[i]+3;
	}
}
// Find or create hash element matching checksum ch
inline U8* ContextMap::E::get(U16 ch)
{
	if(chk[last&15]==ch) return &bh[last&15][0];
	U32 b=0xffff,bi=0;
	for(U8 i=0;i<7;++i)
	{
		if(chk[i]==ch) return last=last<<4|i,&bh[i][0];
		const U32 pri=bh[i][0];
		if((last&15)!=i && last>>4!=i && pri<b) b=pri,bi=i;
	}
	return last=0xf0|bi,chk[bi]=ch,(U8*)memset(&bh[bi][0],0,7);
}
inline void ContextMap::set(U32 cx) // Set the i'th context to cx
{
	const U32 i=cn++;
	cx=cx*987654323+i; // permute (don't hash) cx to spread distribution
	cx=cx<<16|cx>>16;
	cxt[i]=cx+i;
}
// Update the model with bit y1,and predict next bit to mixer m.
U32 ContextMap::mix()	// Update model with y
{
	const U32 bp=bpos,cc=c0;
	U32 ret=0;
	for(U32 i=0;i<cn;++i)
	{
		if(cp[i]) *cp[i]=nex(*cp[i],y);
		// Update context pointers
			 if(bp >1&&runp[i][0]==0)	cp[i]=0;
		else if(bp==1||bp==3||bp==6)	cp[i]=cp0[i]+1+(cc&1);
		else if(bp==4||bp==7)			cp[i]=cp0[i]+3+(cc&3);
		else
		{
			cp0[i]=cp[i]=t[cxt[i]+cc&t.size()-1].get(cxt[i]>>16);
			if(bp==0)	// Update pending bit histories for bits 2-7
			{
				if(cp0[i][3]==2)
				{
					const U32 c=cp0[i][4]+256; U8 *p;
					p=t[cxt[i]+(c>>6)&t.size()-1].get(cxt[i]>>16);
					p[0 		  ]=((c>>5)&1)+1;
					p[1+((c>>5)&1)]=((c>>4)&1)+1;
					p[3+((c>>4)&3)]=((c>>3)&1)+1;
					p=t[cxt[i]+(c>>3)&t.size()-1].get(cxt[i]>>16);
					p[0 		  ]=((c>>2)&1)+1;
					p[1+((c>>2)&1)]=((c>>1)&1)+1;
					p[3+((c>>1)&3)]=( c    &1)+1;
					cp0[i][6]=0;
				}
				// Update run count of previous context
				if(!runp[i][0]) 		// new context
					runp[i][0]=2,runp[i][1]=c1;
				else if(runp[i][1]!=c1) // different byte in context
					runp[i][0]=1,runp[i][1]=c1;
				else if(runp[i][0]<254) // same byte in context
					runp[i][0]+=2;
				runp[i]=cp0[i]+3;
			}
		}
		// predict from last byte in context
		if(((U32)runp[i][1]+256)>>(8-bp)==cc)//predicted bit +for1,-for0
		{
			U32 rc=runp[i][0];// count*2,+1 if 2 different bytes seen
			m.add(((runp[i][1]>>(7-bp)&1)*2-1)*ilog(rc+1)<<(2+(~rc&1)));
		}
		else
			m.add(0);
		// predict from bit context
		ret+=mix2(cp[i]?*cp[i]:0, sm[i]);
	}
	if(bp==7) cn=0;
	return ret;
}
////////////////////////////////////////////////////////////////////////
//matchModel() finds the longest matching context and returns its length
U32 matchModel()
{
	const  U32 MAXLEN=2047; 	// longest allowed match + 1
	static Array<U32> t(MEM);	// hash table of pointers to contexts
	static U32 h  =0;			// hash of last 7 bytes
	static U32 ptr=0;			// points to next byte of match if any
	static U32 len=0;			// length of match,or 0 ifno match
	static U32 ret=0;
	if(!bpos)
	{
		h=h*997*8+c1+1&t.size()-1; // update context hash
		if(len) ++len,++ptr;
		else		// find match
		{
			ptr=t[h];
			if(ptr && pos-ptr<buf.size())
				while(buf(len+1)==buf[ptr-len-1] && len<MAXLEN) ++len;
		}
		t[h]=pos;	// update hash table
		ret =len;
//if(ret>0&&!(ret&0xfff)) printf("pos=%d len=%d ptr=%d\n",pos,len,ptr);
	}
	// predict
	if(len>MAXLEN) len=MAXLEN;
	int sgn;
	if(len && c1==buf[ptr-1] && c0==((U32)buf[ptr]+256)>>(8-bpos))
	{
		if(buf[ptr]>>(7-bpos)&1) sgn=1;
		else sgn=-1;
	}
	else sgn=len=0;
	m.add(sgn* 4*ilog(len));
	m.add(sgn*64*(len<32?len:32));	//112
	return ret;
}
////////////////////////////////////////////////////////////////////////
// Model 2-D data with fixed record length. Also order 1-2 models
// that include the distance to the last match.
void recordModel()
{										//buf(1)->last 3 pos
	static Array<U32> cpos1(256),cpos2(256),cpos3(256),cpos4(256);
	static U32 rlen=2,rlen1=3,rlen2=4;	// run length and 2 candidates
	static U32 rcount1=0,rcount2=0; 	// candidate counts
	static ContextMap cm(MEM*4,4);
	if(!bpos)
	{
		const U32 c=c1,r=pos-cpos1[c];
		if(r>1 && r==cpos1[c]-cpos2[c]
			   && r==cpos2[c]-cpos3[c]
			   && r==cpos3[c]-cpos4[c]
			   && (r>15 || (c==buf(r*5+1)) && c==buf(r*6+1)))
		{
				 if(r==rlen1) ++rcount1;
			else if(r==rlen2) ++rcount2;
			else if(rcount1>rcount2) rlen2=r,rcount2=1;
			else rlen1=r,rcount1=1;
		}
		if(rcount1>15 && rlen!=rlen1) rlen=rlen1,rcount1=rcount2=0;
		if(rcount2>15 && rlen!=rlen2) rlen=rlen2,rcount1=rcount2=0;
		const U32 col=pos%rlen;
		cm.set(c1<<8|buf(rlen));					//27
		cm.set(rlen|buf(rlen)<<10|buf(rlen*2)<<18); //25
		cm.set(rlen|c1		 <<10|col<<18); 		//34
		cm.set( col|rlen<<12);						//14
		cpos4[c]=cpos3[c];	// update last context positions
		cpos3[c]=cpos2[c];
		cpos2[c]=cpos1[c];
		cpos1[c]=pos;
	}
	cm.mix();
}
////////////////////////////////////////////////////////////////////////
// Model order 1-2 contexts with gaps.
void sparseModel()
{
	static ContextMap cm(MEM*4,7),scm(MEM,4);
	if(!bpos)
	{
		 cm.set(c4&0x00ff00ff); 	//16
		 cm.set(c4&0xff0000ff); 	//18
		 cm.set(c4&0x00ffff00); 	//29
		 cm.set(c1|buf(5)<<8);		//16
		 cm.set(c1|buf(6)<<8);		//17
		 cm.set(c3|buf(6)<<8);		//14
		 cm.set(c4>>24|buf(8)<<8);	//18
		scm.set(c1);				//70
		scm.set(c2);				//39
		scm.set(c3);				//14
		scm.set(buf(8));			//39
	}
	 cm.mix();
	scm.mix();
}
////////////////////////////////////////////////////////////////////////
// Model English text (words and columns/end of line)
void wordModel()
{
	static U32 word0=0;
	static U32 text0=0; // hash stream of letters
	static ContextMap cm(MEM*16,4);
	if(!bpos)
	{
		U32 c=c1;
		if(c-'A'<='Z'-'A') c+='a'-'A';
		if(c-'a'<='z'-'a')
		{
			word0=word0*263* 4+c;
			text0=text0*997*16+c;
		}
		else word0=0;
		cm.set(word0);					//102
		cm.set(text0&0xffffff); 		//23
		cm.set(c4&0xffff0000);			//25
		cm.set(c4&0xff00ff00|buf(6));	//22
	}
	cm.mix();
}
////////////////////////////////////////////////////////////////////////
// The context is 1B string history that occurs within a 1/2 B context.
void indirectModel()
{
	static ContextMap cm(MEM*4,2);
	static Array<U32> t1(256);
	if(!bpos)
	{
		U32 &r1=t1[c2]; r1=r1<<8|c1;
		const U32 t=c1|t1[c1]<<8;
		cm.set(t&0xffff);	//13
		cm.set(t&0xffffff); //12
	}
	cm.mix();
}
////////////////////////////////////////////////////////////////////////
// Model a 24-bit color uncompressed .bmp or .tiffile.
// Return width in pixels ifan image file is detected,else 0.
// 32-bit little endian number at buf(i)..buf(i-3)
inline U32 i4(int i)
{
	return buf(i)+256*buf(i-1)+65536*buf(i-2)+16777216*buf(i-3);
}
inline U32 i2(int i)	// 16-bit
{
	return buf(i)+256*buf(i-1);
}
inline U32 sqrbuf(int i)// Square buf(i)
{
	return buf(i)*buf(i);
}
int bmpModel()
{
	static U32 w   =0;	// width of image in bytes (pixels * 3)
	static U32 eoi =0;	// end of image
	static U32 tiff=0;	// offset of tifheader
	const  U32 SC  =0x20000;
	static SmallStationaryContextMap scm1(SC),scm2(SC),
				   scm3(SC),scm4(SC),scm5(SC),scm6(SC*2);
	static ContextMap cm(MEM*4,8);
	// Detect .bmp file header (24 bit color,not compressed)
	if(!bpos && buf(54)=='B' && buf(53)=='M' &&
				 i4(44)==54  &&  i4(40)==40  &&
				 i2(26)==24  &&  i4(24)==0	 )
	{
		w=(i4(36)+3&-4)*3;	// image width
		const U32 height=i4(32);
		if(w<0x30000 && height<0x10000)
		{
			eoi=pos+w*height;  // image size in bytes
//			printf("BMP %dx%d\n",w/3,height);
		}
		else eoi=pos;
	}
	// Detect .tiffile header (24 bit color,not compressed).
	// Parsing is crude,won't work with weird formats.
	if(!bpos)
	{
		if(c4==0x49492a00) tiff=pos;  // Intel format only
		if(pos-tiff==4&&c4!=0x08000000) tiff=0;//8=normal offset to dir
		if(tiff && pos-tiff==200)//most directory should be read by now
		{
			int dirsize=i2(pos-tiff-4); // number of 12B dir entries
			int bpp=0,compression=0,width=w=0,height=0;
			for(U32 i=tiff+6; i<pos-12 && --dirsize>0; i+=12)
			{
				int tag=i2(pos-i);//256=width,257=height,259:1=nocompre
				// 277=3 samples/pixel
				int tagfmt=i2(pos-i-2); // 3=short,4=long
				int taglen=i4(pos-i-4); // number of elements in tagval
				int tagval=i4(pos-i-8); //1long,1-2short,or points2array
				if((tagfmt==3||tagfmt==4) && taglen==1)
				{
					if(tag==256) width=tagval;
					if(tag==257) height=tagval;
					if(tag==259) compression=tagval; // 1=no compression
					if(tag==277) bpp=tagval;  // should be 3
				}
			}
			if(width>0 && height>0 && width*height>50 && compression==1
				&& (bpp==1||bpp==3))
				eoi=tiff+width*height*bpp,w=width*bpp;
			if(eoi<=pos) tiff=w=0;
//			else printf("TIFF %dx%dx%d\n",width,height,bpp);
		}
	}
	if(pos>eoi) return w=0;
	if(!bpos)	// Select nearby pixels as context
	{
		const U32 color=pos%3;
		U32 mean=c3+buf(w-3)+buf(w)+buf(w+3);
		const U32 var=(sqrbuf(3)+sqrbuf(w-3)+sqrbuf(w)+sqrbuf(w+3)-
													mean*mean/4)>>2;
		mean>>=2;
		const U32 logvar=ilog(var);
		U32 i=0;
		 cm .set(hash(++i,c3>>2,buf(w)>>2,color));
		 cm .set(hash(++i,c3>>2,	c1>>2,color));
		 cm .set(hash(++i,c3>>2,	c2>>2,color));
		 cm .set(hash(++i,buf(w)>>2,c1>>2,color));
		 cm .set(hash(++i,buf(w)>>2,c2>>2,color));
		 cm .set(hash(++i, (c3+buf(w))>>1,color));
		 cm .set(hash(++i, (c3+buf(w))>>3,c1>>5,c2>>5,color));
		 cm .set(hash(++i,mean, logvar>>5,color));
		scm1.set((c3  +buf(w))>>1);
		scm2.set((c3  +buf(w)-buf(w+3))>>1);
		scm3.set((c3*2-buf(6))>>1);
		scm4.set((buf(w)*2-buf(w*2))>>1);
		scm5.set((c3  +buf(w)-buf(w-3))>>1);
		scm6.set(mean>>1|logvar<<1&0x180);
	}
	 cm .mix();
	scm1.mix();
	scm2.mix();
	scm3.mix();
	scm4.mix();
	scm5.mix();
	scm6.mix();
	return w;
}
////////////////////////////////////////////////////////////////////////
// Model x86 code.	The contexts are sparse containing only those bits
// relevant to parsing (2 prefixes,opcode,and mod and r/m fields of mod
// RM byte). Get context at buf(i) relevant to parsing 32-bit x86 code
__declspec(naked) U32 __stdcall bswap(U32 x)
{
	__asm mov eax,[esp+4]
	__asm bswap eax
	__asm ret 4
}
void exeModel()
{
	const U32 N=12;
	static ContextMap cm(MEM*2,N);
	static U32 state=0,base=0,aim=0,len=0;
	if(!bpos)
	{
		U32 last4=bswap(c4);
		switch(state)
		{
		case 0: if((last4&0xffff)==0x5a4d)
				{
					state=1;
					base=pos-4;
					aim=base+0x40;
				}return;
		case 1: if(pos>=aim)
					if(pos>aim||last4+4<pos-base||last4>0x1000) state=0;
					else {state=2;aim=base+last4+4;}
				return;
		case 2: if(pos>=aim)
					if(pos>aim||last4!=0x4550) state=0;
					else {state=3;aim+=0x108;}
				return;
		case 3: if(pos>=aim)
					if(pos>aim||!last4||last4>MEM*8) state=0;
					else {state=4;aim+=4;len=last4;}
				return;
		case 4: if(pos>=aim)
					if(pos>aim||last4<=pos-base||last4>0x10000) state=0;
					else {state=5;aim=base+last4;}
				return;
		case 5: if(pos>=aim) {state=6;aim+=len;type=1;}
//printf("pos=%p,aim=%p,len=%p\n",pos,aim,len);
				return;
		case 6: if(pos>=aim) {state=0;type=0;return;}
				else for(U32 i=0;i<N;++i)
				{
					U32 prefix=(buf(i+2)==0x0f)  +(buf(i+2)==0x66)* 2+
							   (buf(i+2)==0x67)*3+(buf(i+3)==0x0f)* 4+
							   (buf(i+3)==0x66)*8+(buf(i+3)==0x67)*12;
					U32 opcode=buf(i+1);
					U32 modrm =i ? buf(i)&0xc7 : 0;
					cm.set(prefix|opcode<<4|modrm<<12|c1*(i>4)<<20);
				}
		}
	}
	if(state==6) cm.mix();
}
////////////////////////////////////////////////////////////////////////
// file types (order is important: the last will be sorted by filetype
// detection as the first)This combines all context models with a Mixer.
U32 contextModel()
{
	static ContextMap cm(MEM*32,5);
	static U32 cxt[9];
	m.update();
	m.add(256);
	U32 is=matchModel();	// Length of longest matching context
	if(is>=100)
	{
		m.set(0,8);
		return m.p();
	}
	is=bmpModel();	// Image width (B) if BMP or TIFF detected,or 0
	if(is>0)
	{
		static U32 col=0;
		if(++col>=24) col=0;
		m.set(2,8);
		m.set(col,24);
		m.set((buf(is)+c3)>>4,32);
		m.set(c0,256);
		return m.p();
	}
	if(!bpos)
	{
		for(U32 i=8;i;--i) cxt[i]=cxt[i-1]*257+c1+1;
		cm.set(cxt[0]); 			//41
		cm.set(cxt[2]); 			//33
		cm.set(cxt[3]); 			//18
		cm.set(cxt[5]); 			//11
		if(type!=1) cm.set(cxt[8]); //13
	}
	const U32 order=cm.mix();
	sparseModel();		//396
	if(type!=1)
	{
		recordModel();	//177
		wordModel();	//190
	}
	indirectModel();	//45
	exeModel();
	m.set(8+order+(c4&0xe0)+(c1==c2)*16,256);
	m.set(c0* 4+(c1/64),1024);			//37
	m.set(c1/32+(c2/32)*8+(c3&192),256);//48
	return m.p();
}
////////////////////////////////////////////////////////////////////////
class Predictor
{
	U32 p;
public:
	Predictor():p(2048) {}
	U32 predict() const {return p;} // 0 ~ 4095
	void update();
};
////////////////////////////////////////////////////////////////////////
void Predictor::update()
{
	static APM a1(256),a2(65536);
	c0+=c0+y;
	if(c0>=256)
	{
		buf[pos++]=c0;
		c3=c2; c2=c1; c1=c0-256;
		c4=(c4<<8)+c1; c0=1;
	}
	bpos=(bpos+1)&7;
	p=contextModel();
	p=(a1.p(p,c0)*3+p+2)>>2;
	p= a2.p(p,c0+c1*256);
}
////////////////////////////////////////////////////////////////////////
class Coder
{
public:
	enum		Mode{ENC,DEC};
private:
	Predictor	pred;
	U32 		x0,x1,x;
	FILE		*fp;
	Mode		mode;
	enum		{OUTBUFSIZE=8192};
	U32 		outpos;
	U8			outbuf[OUTBUFSIZE];
	void encode(U32 b)
	{
		U32 p=pred.predict(); p+=(p<2048); // 1 ~ fff
		U32 m=x1-x0; m=x0+(m>>12)*p+((m&0xfff)*p>>12);
		if((y=b)) x1=m; else x0=m+1;
		pred.update();
		while(!((x0^x1)&0xff000000))
		{
			outbuf[outpos++]=x1>>24;
			if(outpos==OUTBUFSIZE)
			{
				fwrite(outbuf,1,OUTBUFSIZE,fp);
				outpos=0;
			}
			x0<<=8;
			x1=(x1<<8)+255;
		}
	}
	U32 decode()
	{
		U32 p=pred.predict(); p+=(p<2048); // 1 ~ fff
		U32 m=x1-x0; m=x0+(m>>12)*p+((m&0xfff)*p>>12);
		if((y=x<=m)) x1=m; else x0=m+1;
		pred.update();
		while(!((x0^x1)&0xff000000))
		{
			x=(x<<8)+(fgetc(fp)&255);
			x0<<=8;
			x1=(x1<<8)+255;
		}
		return y;
	}
public:
	Coder(Mode m,FILE *f):mode(m),fp(f),x0(0),x1(-1),x(0),outpos(0)
	{
		if(mode==DEC)
			for(U32 i=4;i;--i)
				x=(x<<8)+(fgetc(fp)&255);
	}
	void reset() {type=0;}
	void enc(U8 c)
	{
		for(U32 i=8;i;) encode((c>>--i)&1);
	}
	U8 dec()
	{
		U8 c=0;
		for(U32 i=8;i;--i) c+=c+decode();
		return c;
	}
	void flush()
	{
		if(mode==ENC)
		{
			outbuf[outpos++]=x0>>24;
			fwrite(outbuf,1,outpos,fp);
			outpos=0;
		}
	}
};
////////////////////////////////////////////////////////////////////////
