#pragma pack(1)

#include "copro.h"
#include "string.h"
#include "tinyrxm.h"


//allocator defined elsewhere in the project; startxm uses it to obtain the
//buffer that receives the depacked samples
void *getmem(int size);
//pointer defined elsewhere; startxm uses the memory it points to as the
//temporary tpackdata workspace
extern void *globalHeap;
//first byte of the embedded tune blob; startxm takes its address and parses
//the data that follows (length-prefixed xm data, then the packed samples)
extern char mucke;
//Watcom auxiliary pragma: the object name pattern "*" is the symbol name
//as written, with no decoration added
//NOTE(review): presumably needed because the blob is linked in from a
//non-C object - confirm against the build
#pragma aux mucke "*"

//number of blocks readst decodes into tpackdata.st per call (startxm never
//asks for more than this)
#define PBlocks 16
//workspace of the sample depacker; startxm only needs it while depacking
typedef struct {
  //window coefficients, filled by initdepack from the cosine series in U
  float C[512];
  //cosine matrix M[i][k] = cos((2*i + 1)*(k - 16)*pi/64), filled by initdepack
  float M[32][64];
  //accumulation buffer: calcout adds into it, startxm emits the top 32
  //entries per block and then shifts the contents up by 32
  float buf[512];
  //dequantised values written by readst, indexed [block][band]
  float st[PBlocks][32];
} tpackdata;

//the eight coefficients of the cosine series that initdepack sums to build
//the window: C[k] = +/- sum over n of U[n]*cos(2*pi*k*n/512)
float U[] = {
  0.124996, -0.250000, 0.249061, -0.234831,
  0.176782, -0.085736, 0.021643, -0.001933};

//Prepares a depacker workspace: computes the window table C and the cosine
//matrix M, and clears the accumulation buffer buf.
//p->st is left untouched; readst fills it later.
void initdepack(tpackdata *p) {
  int band, tap, term;
  float step, acc;

  //window: 8-term cosine series over a period of 512 entries
  step = 2.0*pi/512.0;
  for (tap = 0; tap < 512; tap++) {
    acc = 0.0;
    for (term = 0; term < 8; term++) {
      acc += U[term]*cos((tap)*term*step);
    }
    //entries whose index has bit 6 set (every other run of 64) are negated
    p->C[tap] = (tap & 64) ? -acc : acc;
  }

  //matrix: one row of 64 cosine values per band
  for (band = 0; band < 32; band++) {
    float *row = p->M[band];

    for (tap = 0; tap < 64; tap++) {
      row[tap] = cos((2*band + 1)*(tap - 16)*pi/64.0);
    }
  }

  //start with an empty accumulation buffer
  bzero(p->buf, sizeof p->buf);
}

//Assembles a 32-bit word from four consecutive bytes, least significant
//byte first. Reading byte-wise avoids dereferencing a byte pointer as int,
//which may be misaligned.
static unsigned readst_le32(const signed char *src) {
  return  (unsigned) (unsigned char) src[0]
       | ((unsigned) (unsigned char) src[1] << 8)
       | ((unsigned) (unsigned char) src[2] << 16)
       | ((unsigned) (unsigned char) src[3] << 24);
}

//Decodes one group of packed blocks into p->st.
//
//  p      workspace; p->st[z][i] receives the value of band i in block z
//  pdata  start of the packed group
//  blocks number of blocks in the group; must not exceed PBlocks, the
//         first dimension of p->st
//
//Returns the number of bytes consumed from pdata.
//
//Packed layout:
//  3 control words of 32 bits each (least significant byte first); bit i of
//  each word selects the format of band i:
//    ctrl1 ctrl2 ctrl3   bits  shift
//      1     1     x       4     4
//      1     0     1       4     3
//      0     1     1       4     2
//      1     0     0       2     2
//      0     1     0       2     1
//      0     0     1       2     0
//      0     0     0       band not stored, all values are 0
//  then, for every stored band in ascending order, `blocks` fields of
//  `bits` bits each, packed most significant field first. Every band
//  starts on a fresh byte.
//
//A field holding the signed number q decodes to (q + 0.5) * 2^shift.
int readst(tpackdata *p, signed char *pdata, int blocks) {
  signed char *pbegin = pdata;
  unsigned ctrl1, ctrl2, ctrl3, bit;
  unsigned char byte;
  int has1, has2, has3;
  int i, z;
  int bits, shift;
  int left, q, s;

  ctrl1 = readst_le32(pdata);
  pdata += 4;
  ctrl2 = readst_le32(pdata);
  pdata += 4;
  ctrl3 = readst_le32(pdata);
  pdata += 4;

  for (i = 0; i < 32; i++) {
    //unsigned mask: 1 << 31 would overflow a signed int
    bit = 1u << i;
    has1 = (ctrl1 & bit) != 0;
    has2 = (ctrl2 & bit) != 0;
    has3 = (ctrl3 & bit) != 0;

    if (has1 && has2) {
      bits = 4;
      shift = 4;
    } else if (has1 && has3) {
      bits = 4;
      shift = 3;
    } else if (has2 && has3) {
      bits = 4;
      shift = 2;
    } else if (has1) {
      bits = 2;
      shift = 2;
    } else if (has2) {
      bits = 2;
      shift = 1;
    } else if (has3) {
      bits = 2;
      shift = 0;
    } else {
      bits = 0;
      shift = 0;
    }

    if (bits == 0) {
      for (z = 0; z < blocks; z++) p->st[z][i] = 0.0;
      continue;
    }

    byte = 0;
    left = 0;  //unread bits remaining in `byte`
    for (z = 0; z < blocks; z++) {
      if (left == 0) {
        byte = (unsigned char) *pdata;
        pdata++;
        left = 8;
      }
      left -= bits;

      //take the top `bits` bits and sign-extend them by hand; this avoids
      //left-shifting a negative value
      q = byte >> (8 - bits);
      if (q >= (1 << (bits - 1))) q -= 1 << bits;

      //fixed point with 8 fractional bits: 128 is the half step added to q
      s = (q*256 + 128)*(1 << shift);
      p->st[z][i] = s/256.0;

      //drop the consumed field, keeping only the low 8 bits
      byte = (unsigned char) (byte << bits);
    }
  }

  return (int) (pdata - pbegin);
}

//Adds the contribution of one block to the accumulation buffer.
//
//  p   workspace holding M, C and buf
//  st  the 32 band values of the block
//
//For each of the 64 matrix columns the band values are combined with that
//column of M; the result, weighted by the window C, is added to the eight
//buf entries that lie 64 apart starting at the column index.
void calcout(tpackdata *p, float *st) {
  int col, band, pos;
  float sum;

  for (col = 0; col < 64; col++) {
    sum = 0.0;
    for (band = 0; band < 32; band++) {
      sum += st[band]*p->M[band][col];
    }

    //col, col + 64, ..., col + 448
    for (pos = col; pos < 512; pos += 64) {
      p->buf[pos] += p->C[pos]*sum;
    }
  }
}

//this depacks and starts the tune
//the memory of tpackdata is only needed during this procedure
//void depack(tpackdata *p, char *tune) {
void startxm() {
  tpackdata *p;
  char *tune;
  int datalen;
  int blocks;
  signed char *pdata;
  signed char *sdata, *sdpos;
  int stpos, z, s;

tune = &mucke;
  //first int = xm data length without samples
  datalen = *(int *) tune;
  //second int = number of blocks
  blocks = *(int *) (tune + 4 + datalen);
  pdata = (signed char *) (tune + 4 + datalen + 4);
  //allocate mem for depacked samples
  sdpos = sdata = (signed char *) getmem(blocks*32);

p = globalHeap;//(tpackdata *) getmem(sizeof(tpackdata));
  initdepack(p);

  stpos = PBlocks;
  while (blocks > 0) {
    if (stpos == PBlocks) {
      pdata += readst(p,pdata,(blocks > PBlocks) ? PBlocks : blocks);
      stpos = 0;
    }

    calcout(p,p->st[stpos]);
    for (z = 0; z < 32; z++) {
      s = floor(p->buf[511 - z]);
      if (s > 127) s = 127;
      if (s < -128) s = -128;
//printf("%4d",s);
      *sdpos = s;
      sdpos++;
    }
    for (z = 512 - 32 - 1; z >= 0; z--) {
      p->buf[z + 32] = p->buf[z];
    }
    for (z = 0; z < 32; z++) p->buf[z] = 0.0;

    stpos++;
    blocks--;
  }
  i8_init();
  rxmplay(tune + 4, sdata);
}

//Stops the tune started by startxm: calls rxmstop(xmStop) first and
//i8_done() afterwards.
//NOTE(review): i8_done is presumably the counterpart of the i8_init call in
//startxm - confirm in the player sources
void stopxm() {
  rxmstop(xmStop);
  i8_done();
}