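/*
 * jpeg-f.cc --
 *
 *      JPEG decoder.  Builds Huffman lookup tables, parses the
 *      entropy-coded blocks of a frame, and skips blocks whose
 *      low-frequency coefficients are close to the values cached
 *      from the previously decoded block at the same position.
 */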

/*
 * This code is derived from the Independent JPEG Group's JPEG software:
 *
 * Copyright (C) 1991, 1992, Thomas G. Lane.
 * This file is part of the Independent JPEG Group's software.
 * For conditions of distribution and use, see the accompanying
 * README.IJPG file.
 */

/*
 * Fixed-point arithmetic helpers.
 * FIXME these definitions don't belong in this file.
 */
/* Divide x by 2^n, adding half of the divisor first so the result rounds. */
#define DESCALE(x,n)  (((x) + (1 << ((n) - 1))) >> (n))
/* Number of fraction bits carried by the constants that FIX() produces. */
#define CONST_BITS 13
#define MULTIPLY(a, b) ((a) * (b))
#define CONST_SCALE (1 << CONST_BITS)
/* Convert a floating-point constant to an int scaled by CONST_SCALE. */
#define FIX(x)	((int)((x) * CONST_SCALE + 0.5))
/* Extra scaling (as a power of two) applied by the first IDCT pass. */
#define PASS1_BITS 2

#include "jpeg.h"
#include "endian.h"

#include <stdlib.h>
#include <stdio.h>
#include <sys/param.h>
#include <netinet/in.h>

/*
 * Ck = cos(k pi / 16)
 * Sk = sin(k pi / 16)
 *
 * The sines are written in terms of the cosines using
 * sin(k pi / 16) = cos((8 - k) pi / 16).  The C_k / S_k names are the
 * cosine / sine of the negated angle, so C_k == Ck and S_k == -Sk.
 */
#define C1 0.98078528
#define C2 0.92387953
#define C3 0.83146961
#define C4 0.70710678
#define C5 0.55557023
#define C6 0.38268343
#define C7 0.19509032
#define S1 C7
#define S3 C5
#define S6 C2
#define C_1 C1
#define S_1 (-S1)
#define C_3 C3
#define S_3 (-S3)

/*
 * Inverse-DCT entry points; their definitions are not in this part of
 * the file.  JpegDecoder_422::decode calls v_rdct with the coefficient
 * block, the two-word mask produced by the Huffman parser, the
 * destination pixel pointer, the row stride, and a table built by
 * JpegDecoder::fold_idct_q.
 */
void idct(short* data, u_int* mask, u_char* p, int stride);
void v_rdct(short* data, u_int* mask, u_char* p, int stride, const int* qt);

#if defined(__osf__) || defined(sgi)
#include <string.h>
#ifdef sgi
#include <bstring.h>
#endif
#elif !defined(__svr4__)
#include <bstring.h>
#endif

/* Word type used for the 4-byte stores in JpegDecoder::fill. */
#define u_word u_int

/*
 * These two macros stolen from nv.
 */
/*
 * Sick little macro which will limit x to [0..255] with logical ops.
 * t is an int scratch variable.  The first step clears t when it is
 * negative; the second ORs in all ones when t >= 256.  Only the low
 * byte of the result is therefore meaningful, so callers mask with
 * 0xff.  This relies on >> of a negative int replicating the sign bit.
 */
#define UCLIMIT(x, t) ((t = (x)), (t &= ~(t>>31)), (t | ~((t-256) >> 31)))
/* A variant of above which will limit x to [-128..127] */
#define SCLIMIT(x, t) (UCLIMIT((x)+128, t)-128)

/*
 * ZAG[i] maps the i'th element of zigzag order to its position in the
 * 8x8 block.  The table under "notdef" gives row-major (natural)
 * positions; the table actually compiled is its transpose, i.e. the
 * same scan expressed as column-major positions.
 *
 * If the incoming data is corrupted, the Huffman parsers could attempt
 * to reference values beyond the 64 real entries, because a run length
 * of up to 15 is added to the index before it is used.  To avoid a
 * wild access, we put some extra zeroes after the real entries.
 */
#ifdef notdef
static const int ZAG[] = {
	0,  1,  8, 16,  9,  2,  3, 10,
	17, 24, 32, 25, 18, 11,  4,  5,
	12, 19, 26, 33, 40, 48, 41, 34,
	27, 20, 13,  6,  7, 14, 21, 28,
	35, 42, 49, 56, 57, 50, 43, 36,
	29, 22, 15, 23, 30, 37, 44, 51,
	58, 59, 52, 45, 38, 31, 39, 46,
	53, 60, 61, 54, 47, 55, 62, 63,
	/* extra entries in case k>63 below */
	0,  0,  0,  0,  0,  0,  0,  0,
	0,  0,  0,  0,  0,  0,  0,  0
};
#else
/* column order */
static const int ZAG[] = {
0, 8, 1, 2, 9, 16, 24, 17,
10, 3, 4, 11, 18, 25, 32, 40,
33, 26, 19, 12, 5, 6, 13, 20,
27, 34, 41, 48, 56, 49, 42, 35,
28, 21, 14, 7, 15, 22, 29, 36,
43, 50, 57, 58, 51, 44, 37, 30,
23, 31, 38, 45, 52, 59, 60, 53,
46, 39, 47, 54, 61, 62, 55, 63,
	/* extra entries in case k>63 below */
	0,  0,  0,  0,  0,  0,  0,  0,
	0,  0,  0,  0,  0,  0,  0,  0
};
#endif

/*
 * Decoder chosen by JpegDecoder::create when the first component is
 * sampled 2x2 and the other two 1x1.  Its constructor records
 * decimation_ = 411.
 */
class JpegDecoder_411 : public JpegDecoder {
 public:
	JpegDecoder_411(const config&);
	/* Decode one frame of len bytes starting at in. */
	virtual int decode(u_char* in, int len);
};

/*
 * Decoder chosen by JpegDecoder::create when the first component is
 * sampled 2x1 and the other two 1x1.  Its constructor records
 * decimation_ = 422.
 */
class JpegDecoder_422 : public JpegDecoder {
 public:
	JpegDecoder_422(const config&);
	/* Decode one frame of len bytes starting at in. */
	virtual int decode(u_char* in, int len);
};

/*
 * Construct the base decoder from c, then tag this instance as
 * the 4:1:1 variant.
 */
JpegDecoder_411::JpegDecoder_411(const config& c) : JpegDecoder(c)
{
	decimation_ = 411;
}

/*
 * Construct the base decoder from c, then tag this instance as
 * the 4:2:2 variant.
 */
JpegDecoder_422::JpegDecoder_422(const config& c) : JpegDecoder(c)
{
	decimation_ = 422;
}

/*
 * Factory: return a decoder matching the sampling factors in c, or 0
 * when the layout is not supported.  Only three-component streams with
 * 2x horizontal sampling of the first component and 1x1 sampling of
 * the other two are accepted; the vertical factor of the first
 * component selects the 4:1:1 (2) or 4:2:2 (1) decoder.
 */
JpegDecoder* JpegDecoder::create(const config& c)
{
	if (c.ncomp != 3)
		return (0);
	if (c.comp[0].hsf != 2)
		return (0);
	if (c.comp[1].hsf != 1 || c.comp[1].vsf != 1)
		return (0);
	if (c.comp[2].hsf != 1 || c.comp[2].vsf != 1)
		return (0);

	switch (c.comp[0].vsf) {
	case 2:
		return (new JpegDecoder_411(c));
	case 1:
		return (new JpegDecoder_422(c));
	}
	return (0);
}

/*
 * Map a quality setting to the percentage scale factor that
 * JpegDecoder::quantizer applies to the standard tables:
 *   v >= 100      -> 1
 *   50 <= v < 100 -> 200 - 2 * v
 *   1 <= v < 50   -> 5000 / v
 *   v < 1         -> 5000
 */
int quality_to_qfactor(int v)
{
	if (v >= 100)
		return (1);
	if (v >= 50)
		return (200 - v * 2);
	if (v >= 1)
		return (5000 / v);
	return (5000);
}

/*
 * Tables from IJPG software
 *
 * Base quantizer step sizes for the luminance and chrominance
 * components.  JpegDecoder::quantizer scales them into c.qtab[0] and
 * c.qtab[1]; entry i is later placed at block position ZAG[i] by
 * JpegDecoder::fold_idct_q, so the entries are listed in scan order
 * rather than in row/column order.
 */
static const int std_luminance_quant_tbl[64] = {
  16,  11,  12,  14,  12,  10,  16,  14, 13,  14,  18,  17,  16,  19,  24,  40,
  26,  24,  22,  22,  24,  49,  35,  37, 29,  40,  58,  51,  61,  60,  57,  51,
  56,  55,  64,  72,  92,  78,  64,  68, 87,  69,  55,  56,  80, 109,  81,  87,
  95,  98, 103, 104, 103,  62,  77, 113, 121, 112, 100, 120,  92, 101, 103,  99
};

static const int std_chrominance_quant_tbl[64] = {
  17,  18,  18,  24,  21,  24,  47,  26, 26,  47,  99,  66,  56,  66,  99,  99,
  99,  99,  99,  99,  99,  99,  99,  99, 99,  99,  99,  99,  99,  99,  99,  99,
  99,  99,  99,  99,  99,  99,  99,  99, 99,  99,  99,  99,  99,  99,  99,  99,
  99,  99,  99,  99,  99,  99,  99,  99, 99,  99,  99,  99,  99,  99,  99,  99
};

/*
 * Default Huffman table descriptions installed by JpegDecoder::defaults.
 * Each table is a pair of arrays consumed by JpegDecoder::huffbuild:
 * xxx_bits[k] (k = 1..16) is the number of codes that are k bits long
 * (element 0 is unused), and xxx_val[] lists the symbol assigned to
 * each code, in order of increasing code length.
 */
static const unsigned char dc_luminance_bits[17] =
    { /* 0-base */ 0, 0, 1, 5, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0 };
static const unsigned char dc_luminance_val[] =
    { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 };

static const unsigned char dc_chrominance_bits[17] =
    { /* 0-base */ 0, 0, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0 };
static const unsigned char dc_chrominance_val[] =
    { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11 };

/* AC symbols: the parsers split each one into run (high 4 bits) and size (low 4 bits). */
static const unsigned char ac_luminance_bits[17] =
    { /* 0-base */ 0, 0, 2, 1, 3, 3, 2, 4, 3, 5, 5, 4, 4, 0, 0, 1, 0x7d };
static const unsigned char ac_luminance_val[] =
    { 0x01, 0x02, 0x03, 0x00, 0x04, 0x11, 0x05, 0x12,
      0x21, 0x31, 0x41, 0x06, 0x13, 0x51, 0x61, 0x07,
      0x22, 0x71, 0x14, 0x32, 0x81, 0x91, 0xa1, 0x08,
      0x23, 0x42, 0xb1, 0xc1, 0x15, 0x52, 0xd1, 0xf0,
      0x24, 0x33, 0x62, 0x72, 0x82, 0x09, 0x0a, 0x16,
      0x17, 0x18, 0x19, 0x1a, 0x25, 0x26, 0x27, 0x28,
      0x29, 0x2a, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39,
      0x3a, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49,
      0x4a, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59,
      0x5a, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69,
      0x6a, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79,
      0x7a, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89,
      0x8a, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98,
      0x99, 0x9a, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7,
      0xa8, 0xa9, 0xaa, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6,
      0xb7, 0xb8, 0xb9, 0xba, 0xc2, 0xc3, 0xc4, 0xc5,
      0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xd2, 0xd3, 0xd4,
      0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xe1, 0xe2,
      0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea,
      0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8,
      0xf9, 0xfa };

static const unsigned char ac_chrominance_bits[17] =
    { /* 0-base */ 0, 0, 2, 1, 2, 4, 4, 3, 4, 7, 5, 4, 4, 0, 1, 2, 0x77 };
static const unsigned char ac_chrominance_val[] =
    { 0x00, 0x01, 0x02, 0x03, 0x11, 0x04, 0x05, 0x21,
      0x31, 0x06, 0x12, 0x41, 0x51, 0x07, 0x61, 0x71,
      0x13, 0x22, 0x32, 0x81, 0x08, 0x14, 0x42, 0x91,
      0xa1, 0xb1, 0xc1, 0x09, 0x23, 0x33, 0x52, 0xf0,
      0x15, 0x62, 0x72, 0xd1, 0x0a, 0x16, 0x24, 0x34,
      0xe1, 0x25, 0xf1, 0x17, 0x18, 0x19, 0x1a, 0x26,
      0x27, 0x28, 0x29, 0x2a, 0x35, 0x36, 0x37, 0x38,
      0x39, 0x3a, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48,
      0x49, 0x4a, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58,
      0x59, 0x5a, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68,
      0x69, 0x6a, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78,
      0x79, 0x7a, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87,
      0x88, 0x89, 0x8a, 0x92, 0x93, 0x94, 0x95, 0x96,
      0x97, 0x98, 0x99, 0x9a, 0xa2, 0xa3, 0xa4, 0xa5,
      0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xb2, 0xb3, 0xb4,
      0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xc2, 0xc3,
      0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xd2,
      0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda,
      0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9,
      0xea, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8,
      0xf9, 0xfa };

/*
 * Fill c with the default stream description: zero dimensions,
 * 8-bit precision and three components (Y sampled 2x1, U and V 1x1).
 * Y uses quantizer 0 and Huffman tables 0; U and V use quantizer 1
 * and Huffman tables 1.  Huffman slots 2 and 3 are left empty.
 * The quantizer tables themselves are not touched here.
 */
void JpegDecoder::defaults(JpegDecoder::config& c)
{
	int n;

	c.width = 0;
	c.height = 0;
	c.precision = 8;
	c.ncomp = 3;

	/* Y */
	c.comp[0].id = 0;
	c.comp[0].hsf = 2;
	c.comp[0].vsf = 1;
	c.comp[0].qno = 0;
	c.comp[0].dc_tbl_no = 0;
	c.comp[0].ac_tbl_no = 0;

	/* U and V share every setting except the id */
	for (n = 1; n <= 2; ++n) {
		c.comp[n].id = n;
		c.comp[n].hsf = 1;
		c.comp[n].vsf = 1;
		c.comp[n].qno = 1;
		c.comp[n].dc_tbl_no = 1;
		c.comp[n].ac_tbl_no = 1;
	}

	/* Slot 0: luminance tables.  Slot 1: chrominance tables. */
	c.dc_huffbits[0] = dc_luminance_bits;
	c.dc_huffval[0] = dc_luminance_val;
	c.ac_huffbits[0] = ac_luminance_bits;
	c.ac_huffval[0] = ac_luminance_val;
	c.dc_huffbits[1] = dc_chrominance_bits;
	c.dc_huffval[1] = dc_chrominance_val;
	c.ac_huffbits[1] = ac_chrominance_bits;
	c.ac_huffval[1] = ac_chrominance_val;

	/* Remaining slots are unused. */
	for (n = 2; n < 4; ++n) {
		c.dc_huffbits[n] = 0;
		c.dc_huffval[n] = 0;
		c.ac_huffbits[n] = 0;
		c.ac_huffval[n] = 0;
	}
}

/*
 * Set the quantizer for this configuration.  Q is the IJPG quality
 * factor, which has a value in [0,100].
 *
 * Table 0 is built from the standard luminance table and table 1
 * from the standard chrominance table, each entry being scaled by
 * quality_to_qfactor(q) percent with rounding.  Every step size is
 * forced into [1, 32767]: a step of zero (which the scaling produces
 * for small entries at the highest qualities) would wipe out the
 * coefficient it is multiplied with, and anything above 32767 would
 * not fit the 16-bit table entries that fold_idct_q reads.
 */
void JpegDecoder::quantizer(JpegDecoder::config& c, int q)
{
	static const int* const base[2] = {
		std_luminance_quant_tbl,
		std_chrominance_quant_tbl
	};

	q = quality_to_qfactor(q);
	for (int t = 0; t < 2; ++t) {
		for (int i = 0; i < 64; ++i) {
			int val = (q * base[t][i] + 50) / 100;
			if (val <= 0)
				val = 1;
			if (val > 32767)
				val = 32767;
			c.qtab[t][i] = val;
		}
	}
}

/*
 * Start from an empty decoder (no frame store, no block cache, no
 * Huffman tables, dimensions of -1 so that the first init() always
 * allocates) and then configure it from c.
 */
JpegDecoder::JpegDecoder(const config& c)
	: color_(1), width_(-1), height_(-1), decimation_(0),
	  cache_(0), thresh_(0)
{
	/* Clear the per-component state, including the frame pointers. */
	bzero((char*)comp_, sizeof(comp_));

	for (int n = 0; n < NUM_HUFF_TBLS; ++n) {
		acht_[n] = 0;
		dcht_[n] = 0;
	}
	init(c);
}

/*
 * Release everything init() allocated: the Huffman lookup tables,
 * the frame store and the block cache.  comp_[1].frm and comp_[2].frm
 * point into the allocation owned by comp_[0].frm, so only that one
 * is deleted.  Both buffers come from new[] and therefore need the
 * array form of delete.
 */
JpegDecoder::~JpegDecoder()
{
	freehufftab();
	delete[] comp_[0].frm;
	delete[] cache_;
}

/*
 * Expand a 64-entry quantizer table into the 128-entry table handed
 * to the inverse DCT.  Entry i of "in" belongs to block position
 * ZAG[i]; for that position two products are stored at
 * out[2 * pos] and out[2 * pos + 1]: the step size times a
 * fixed-point cosine-like and sine-like factor chosen by the low
 * three bits of the position.  Residues 0 and 4 get a factor of
 * one and a zero second term.
 */
void
JpegDecoder::fold_idct_q(const short *in, int *out)
{
	/* { first factor, second factor } indexed by (position & 7) */
	static const int factor[8][2] = {
		{ FIX(1), 0 },
		{ FIX(C_1 / C4), FIX(S_1 / C4) },
		{ FIX(C6 / C4), FIX(S6 / C4) },
		{ FIX(C_3 / C4), FIX(S_3 / C4) },
		{ FIX(1), 0 },
		{ FIX(C_3 / C4), -FIX(S_3 / C4) },
		{ FIX(C6 / C4), -FIX(S6 / C4) },
		{ FIX(C_1 / C4), -FIX(S_1 / C4) }
	};

	for (int n = 0; n < 64; ++n) {
		const int step = in[n];
		const int pos = ZAG[n];
		const int* f = factor[pos & 7];
		int* dst = out + (pos << 1);

		dst[0] = step * f[0];
		dst[1] = step * f[1];
	}
}

void JpegDecoder::init(const config& c)
{
	rlen_ = 0;
	for (int i = 0; i < 4; ++i)
		fold_idct_q(c.qtab[i], qt_[i]);

	ncomp_ = c.ncomp;/*FIXME*/
	if (width_ != c.width || height_ != c.height) {
		width_ = c.width;
		height_ = c.height;

		/*
		 * Allocate the frame store
		 */
		delete comp_[0].frm;
		int size = size_ = width_ * height_;

		/*FIXME this allocates more than we need for 4:1:1*/
		/*FIXME compute sizes based on decimation factors*/
		u_char* p = new u_char[2 * size];
		/* Initialize to gray */
		memset((char*)p, 0x80, 2 * size);

		comp_[0].frm = p;
		p += size;
		comp_[1].frm = p;
		p += size / 2;
		comp_[2].frm = p;
	}
	for (i = ncomp_; --i >= 0; ) {
		int id = c.comp[i].id;
		comp_[id].hsf = c.comp[i].hsf;
		comp_[id].vsf = c.comp[i].vsf;
		comp_[id].qno = c.comp[i].qno;
		comp_[id].dc_tbl_no = c.comp[i].dc_tbl_no;
		comp_[id].ac_tbl_no = c.comp[i].ac_tbl_no;
	}
	/*
	 * FIXME should check if huffman table won't change
	 * before reallocating.
	 */
	freehufftab();
	for (i = 0; i < 4; ++i) {
		if (c.dc_huffval[i] != 0) {
			int id = c.comp[i].id;
			dcht_[id] = huffbuild(c.dc_huffbits[i],
					      c.dc_huffval[i]);
		}
		if (c.ac_huffval[i] != 0) {
			int id = c.comp[i].id;
			acht_[id] = huffbuild(c.ac_huffbits[i],
					      c.ac_huffval[i]);
		}
	}

	int maxh = 1;
	int maxv = 1;
	for (i = ncomp_; --i >= 0; ) {
		if (maxh < comp_[i].hsf)
			maxh = comp_[i].hsf;
		if (maxv < comp_[i].vsf)
			maxh = comp_[i].vsf;
	}
	ncol_ = (width_ + 8 * maxh - 1) / (8 * maxh);
	nrow_ = (height_ + 8 * maxv - 1) / (8 * maxv);

	int nmcu = ncol_ * nrow_;
	int nblk = nmcu * comp_[0].hsf * comp_[0].vsf;
	nblk += nmcu * comp_[1].hsf * comp_[1].vsf;
	nblk += nmcu * comp_[2].hsf * comp_[2].vsf;
#define NCC 6
	int ns = NCC * nblk;
	delete cache_;
	cache_ = new short[ns];
	bzero((char*)cache_, ns * sizeof(*cache_));
}

/*
 * Release all Huffman lookup tables.  huffbuild allocates them with
 * new[], so they are released with delete[] rather than free().
 * Each slot is cleared afterwards so that a later call (init() and
 * the destructor both call this) cannot free the same table twice
 * when a slot is not rebuilt in between.
 */
void JpegDecoder::freehufftab()
{
	for (int i = 0; i < 4; ++i) {
		delete[] dcht_[i];
		dcht_[i] = 0;
		delete[] acht_[i];
		acht_[i] = 0;
	}
}

/*
 * Paint an 8x8 pixel block with a single value.  dc is clamped to a
 * byte, the byte is replicated across a word, and that word is stored
 * over the first 8 bytes of each of 8 rows that are "stride" bytes
 * apart.  The stores are done through word pointers, so "out" is
 * assumed to be suitably aligned for them.
 */
void JpegDecoder::fill(u_int dc, u_char* out, int stride) const
{
	int t;
	dc = UCLIMIT(dc, t) & 0xff;
	dc |= dc << 8;
	dc |= dc << 16;
#ifdef INT_64
	/* One 64-bit store per row. */
	INT_64 xdc = dc;
	xdc |= xdc << 32;
	for (int row = 0; row < 8; ++row)
		*(INT_64 *)(out + row * stride) = xdc;
#else
	/* Two 32-bit stores per row. */
	for (int row = 0; row < 8; ++row) {
		u_char* line = out + row * stride;
		*(u_word*)line = dc;
		*(u_word*)(line + 4) = dc;
	}
#endif
}

#if NCC != 6

@BUG in blkdiff@
#endif
/*
 * Absolute value.  Written as a conditional rather than with shifts,
 * because left-shifting a negative value is undefined behaviour.
 * Note that t is evaluated more than once.
 */
#define ABS(t) ((t) < 0 ? -(t) : (t))

/*
 * Sum of absolute differences between the first six shorts of blk
 * and cache.  Six is the number of cached values per block (NCC).
 */
static inline int blkdiff(short* blk, short* cache)
{
	int d = 0;
	for (int i = 0; i < 6; ++i) {
		int t = blk[i] - cache[i];
		d += ABS(t);
	}
	return (d);
}

#ifdef notdef
/*
 * Disabled alternative to blkdiff that reads the six shorts as three
 * 32-bit words and handles two shorts per word.
 */
/* FIXME this is negligibly faster than above, and has a bug */
/*
 * NOTE(review): in each pair below, the second "d += ABS(v)" adds v
 * again although t has just been recomputed for the other half of the
 * word; this may be the bug the FIXME refers to -- confirm before
 * re-enabling.
 */
inline int blkdiff(short* blk, short* cache)
{
	u_int* p0 = (u_int*)blk;
	u_int* p1 = (u_int*)cache;

	int m = ~0x80008000;
	int t = (p0[0] >> 1) & m;
	t += (~p1[0] >> 1) & m;
	int v = t << 17 >> 17;
	int d = ABS(v);
	t = (t << 1) >> 17;
	d += ABS(v);

	t = (p0[1] >> 1) & m;
	t += (~p1[1] >> 1) & m;
	v = t << 17 >> 17;
	d += ABS(v);
	t = (t << 1) >> 17;
	d += ABS(v);

	t = (p0[2] >> 1) & m;
	t += (~p1[2] >> 1) & m;
	v = t << 17 >> 17;
	d += ABS(v);
	t = (t << 1) >> 17;
	d += ABS(v);

	return (d);
}
#endif

/*
 * Read a quantization-table segment starting at its two-byte length
 * field.  The parsing code is compiled only when "notyet" is defined;
 * otherwise reaching this function calls abort().
 *
 * In the notyet version the return value is the segment length taken
 * from the stream, or -1 when a table number of 4 or more is seen.
 */
int JpegDecoder::rdqt(const u_char* p)
{
#ifdef notyet
	/* big-endian 16-bit segment length, which counts these two bytes */
	int len = *p++ << 8;
	len |= *p++;

	const u_char* ep = p + len - 2;
	while (p < ep) {
		/* high nibble: precision flag, low nibble: table number */
		int n = *p++;
		int prec = n >> 4;
		n &= 0x0f;
		if (n >= 4) {
			/*FIXME illegal number*/
			return (-1);
		}
		short* qt = qt_[n];
		for (int i = 0; i < 64; i++) {
			int v = *p++;
			/* non-zero precision: entries are two bytes each */
			if (prec)
				v = (v << 8) + *p++;
			qt[i] = v;
		}
	}
	return (len);
#else
	abort();
	return (0);
#endif
}

void JpegDecoder::restart()
{
	int c;
	nbb_ = 0;
	/*FIXMEwhat if ff is sitting in bit buffer?*/
	/* Scan for next JPEG marker */
	do {
		do {			/* skip any non-FF bytes */
			c = *inb_++;
		} while (c != 0xFF);
		do {
			/* skip any duplicate FFs */
			/* we don't increment nbytes here since extra FFs are legal */
			c = *inb_++;
		} while (c == 0xFF);
	} while (c == 0);		/* repeat if it was a stuffed FF/00 */
#ifdef notdef
	if (nbytes != 1)
		WARNMS2(cinfo->emethods,
			"Corrupt JPEG data: %d extraneous bytes before marker 0x%02x",
			nbytes-1, c);

#endif
#ifdef notdef
	if (c != (RST0 + cinfo->next_restart_num)) {
		/* Uh-oh, the restart markers have been messed up too. */
		/* Let the file-format module try to figure out how to resync. */
		(*cinfo->methods->resync_to_restart) (cinfo, c);
	} else
		TRACEMS1(cinfo->emethods, 2, "RST%d", cinfo->next_restart_num);
#endif
	/* Re-initialize DC predictions to 0 */
	comp_[0].dc = 0;
	comp_[1].dc = 0;
	comp_[2].dc = 0;
#ifdef notdef
	cinfo->next_restart_num = (cinfo->next_restart_num + 1) & 7;
#endif
}

/*
 * Walk the marker segments that precede the entropy-coded data and
 * return a pointer to the first byte after the start-of-scan header.
 *
 * Handled markers:
 *   FF DB  quantization table (passed to rdqt)
 *   FF DD  restart interval: sets rlen_ and clears rcnt_
 *   FF DA  start of scan: its header is skipped and parsing stops
 * Everything else is ignored.
 *
 * Returns end_ when the scan header is not found or when a segment
 * would extend past end_.  No byte at or beyond end_ is read.
 */
u_char* JpegDecoder::parseJFIF(u_char* in)
{
	int t;
	while (in < end_) {
		if (*in++ != 0xff)
			continue;
		/* the marker code itself must be inside the buffer */
		if (in >= end_)
			break;
		switch (*in++) {

		default:
			/* Don't know.  Keep looking for SOS. */
			continue;

		case 0xdb:
			/* quantization table */
			t = rdqt(in);
			if (t < 0 || t > end_ - in)
				/*FIXME*/
				return (end_);
			in += t;
			continue;

		case 0xdd:
			/* restart interval definition: length + interval */
			if (end_ - in < 4)
				return (end_);
			t = *in++ << 8;
			t |= *in++;
			/* FIXME t should be 4; a bad length is not reported */
			rlen_ = *in++ << 8;
			rlen_ |= *in++;
			rcnt_ = 0;
			continue;

		case 0xda:
			/* start-of-scan marker */
			if (end_ - in >= 2) {
				/* skip over SOS; the length counts its own two bytes */
				t = *in++ << 8;
				t |= *in++;
				t -= 2;
				/* a length below 2 is malformed: never move backwards */
				if (t < 0)
					t = 0;
				if (t > end_ - in)
					return (end_);
				in += t;
			}
			return (in);
		}
	}
	/*FIXME*/
	return (end_);
}

/*
 * Decode one frame of len bytes at in into the frame store.
 *
 * Each MCU consists of two luminance blocks followed by one block for
 * each chrominance component.  Luminance blocks are parsed with
 * huffparse, which returns 0 when the block is close enough to the
 * cached copy to be skipped; only blocks that were not skipped are
 * passed to v_rdct.  On return minx_/miny_/maxx_/maxy_ hold, in
 * pixels, the bounding box of the MCUs in which something was
 * decoded.  Always returns 0.
 */
int JpegDecoder_422::decode(u_char* in, int len)
{
	/* Start with an empty bounding box (min beyond the grid, max at 0). */
	minx_ = ncol_;
	miny_ = nrow_;
	maxx_ = 0;
	maxy_ = 0;

	inb_ = in;
	end_ = in + len;
	nbb_ = 0;
	/*
	 * If first symbol is a marker (and not a stuffed ff),
	 * assume a jfif header is present and parse it.
	 * FIXME this could change state that needs to be
	 * communicated back to caller.
	 */
	/* NOTE(review): reads in[0] and in[1] without checking len >= 2. */
	if (in[0] == 0xff && in[1] != 0)
		inb_ = parseJFIF(inb_);

	/* p0 walks the luminance plane, p1 the first chrominance plane. */
	u_char* p0 = comp_[0].frm;
	u_char* p1 = comp_[1].frm;
	huffreset();
	short* cache = cache_;
	/* Coefficient block of 64 shorts, declared as u_long for alignment. */
	u_long ablk[sizeof(short) * 64 / sizeof(u_long)];
	short* blk = (short*)ablk;

	const int* qt0 = qt_[comp_[0].qno];
	const int* qt1 = qt_[comp_[1].qno];
	for (int y = 0; y < nrow_; ++y) {
		for (int x = 0; x < ncol_; ++x) {
			/* FIXME use INT_64 */
			u_int mask[2];
			/*
			 * If we're handling restart markers,
			 * check if we need to resync.
			 */
			if (rlen_ != 0 && --rcnt_ <= 0) {
				rcnt_ = rlen_;
				restart();
			}

			/* Left luminance block of the MCU. */
			int nc = huffparse(comp_[0], blk, cache, mask);
			int dontskip = nc;
			cache += NCC;
			if (nc != 0)
				v_rdct(blk, mask, p0, width_, qt0);
			/* Right luminance block, 8 pixels further along the row. */
			nc = huffparse(comp_[0], blk, cache, mask);
			dontskip |= nc;
			cache += NCC;
			if (nc != 0)
				v_rdct(blk, mask, p0 + 8, width_, qt0);
			p0 += 16;
			if (color_) {
				/*
				 * If we found above that the luminance
				 * planes exceeded the threshold, decode
				 * the chroma planes unconditionally.
				 * Otherwise, see if they can be
				 * suppressed too.
				 */
				if (dontskip) {
					huffparsef(comp_[1], blk, cache, mask);
					cache += NCC;
					v_rdct(blk, mask, p1, width_ / 2, qt1);
					huffparsef(comp_[2], blk, cache, mask);
					cache += NCC;
					/* second chroma plane starts size_ / 2 bytes after the first */
					v_rdct(blk, mask, p1 + size_ / 2,
					       width_ / 2, qt1);
				} else {
					nc = huffparse(comp_[1], blk,
						       cache, mask);
					cache += NCC;
					if (nc != 0)
						v_rdct(blk, mask, p1,
						       width_ / 2, qt1);
					dontskip |= nc;
					nc = huffparse(comp_[2], blk,
						       cache, mask);
					cache += NCC;
					if (nc != 0)
						v_rdct(blk, mask,
						       p1 + size_ / 2,
						       width_ / 2, qt1);
					dontskip |= nc;
				}
				p1 += 8;
			} else {
				/* Color disabled: consume the chroma bits without decoding. */
				(void)huffskip(comp_[1]);
				(void)huffskip(comp_[2]);
				cache += 2 * NCC;
			}
			/* Update bounding box */
			/* FIXME these can be locals */
			if (dontskip) {
				if (x < minx_)
					minx_ = x;
				if (x > maxx_)
					maxx_ = x;
				if (y < miny_)
					miny_ = y;
				if (y > maxy_)
					maxy_ = y;
			}
		}
		/*
		 * We're at the end of the current line.
		 * Back up to the beginning, then skip down
		 * one row to the start of the next mcu.
		 */
		p0 -= width_;
		p0 += 8 * width_;
		p1 -= width_ / 2;
		p1 += 8 * width_ / 2;
	}
	/*
	 * Convert mcu coords to pixel coords.
	 * A 4:2:2 mcu is 16x8.
	 */
	minx_ <<= 4;
	maxx_ <<= 4;
	miny_ <<= 3;
	maxy_ <<= 3;
	/* make the max edge exclusive by adding one MCU */
	maxx_ += 16;
	maxy_ += 8;

	return (0);
}

/*
 * 4:1:1 decoding is not implemented: the whole body is compiled out
 * under "notdef", so this function currently returns 0 without
 * reading the input or touching the frame store.
 *
 * The disabled code calls huffparse with three arguments and a
 * function named blit, neither of which matches what is used by the
 * 4:2:2 decoder in this file, so it would need updating before it
 * could be re-enabled.
 */
int JpegDecoder_411::decode(u_char* in, int len)
{
#ifdef notdef
	minx_ = ncol_;
	miny_ = nrow_;
	maxx_ = 0;
	maxy_ = 0;

	inb_ = in;
	end_ = in + len;
	nbb_ = 0;
	/*
	 * FIXME break this into a separate routine that gets called
	 * only when this crap is present
	 */
	/*
	 * If first symbol is a marker (and not a stuffed ff),
	 * assume a jfif header is present and parse it.
	 * FIXME this could change state that needs to be
	 * communicated back to caller.
	 */
	if (in[0] == 0xff && in[1] != 0)
		inb_ = parseJFIF(inb_);

	u_char* p0 = comp_[0].frm;
	u_char* p1 = comp_[1].frm;
	huffreset();
	short* cache = cache_;
	u_long ablk[sizeof(short) * 64 / sizeof(u_long)];
	short* blk = (short*)ablk;

	for (int y = nrow_ / 2; --y >= 0; ) {
		for (int x = ncol_; --x >= 0; ) {
			if (rlen_ != 0 && --rcnt_ <= 0) {
				rcnt_ = rlen_;
				restart();
			}
			/* four luminance blocks: a 2x2 arrangement of 8x8 blocks */
			int skip = huffparse(comp_[0], blk, cache);
			cache += NCC;
			if (!skip)
				blit(blk, p0, width_, qt_);
			int allskip = skip;
			skip = huffparse(comp_[0], blk, cache);
			allskip &= skip;
			cache += NCC;
			if (!skip)
				blit(blk, p0 + 8, width_, qt_);
			skip = huffparse(comp_[0], blk, cache);
			allskip &= skip;
			cache += NCC;
			if (!skip)
				blit(blk, p0 + width_, width_);
			skip = huffparse(comp_[0], blk, cache);
			allskip &= skip;
			cache += NCC;
			if (!skip)
				blit(blk, p0 + width_ + 8, width_);

			p0 += 16;
			if (color_) {
				skip = huffparse(comp_[1], blk, cache);
				cache += NCC;
				allskip &= skip;
				if (!skip)
					blit(blk, p1, width_ / 2);
				skip = huffparse(comp_[2], blk, cache);
				cache += NCC;
				allskip &= skip;
				if (!skip)
					blit(blk, p1 + width_ / 2, width_ / 2);
				p1 += 8;
			} else {
				huffskip(comp_[1]);
				huffskip(comp_[2]);
			}
			/* Update bounding box */
			/* FIXME these can be locals */
			if (!allskip) {
				if (x < minx_)
					minx_ = x;
				if (x > maxx_)
					maxx_ = x;
				if (y < miny_)
					miny_ = y;
				if (y > maxy_)
					maxy_ = y;
			}
		}
		p0 -= width_;
		p0 += 16 * width_;
		p1 -= width_ / 2;
		p1 += 8 * width_ / 2;
	}
	/*
	 * Convert mcu coords to pixel coords.
	 * A 4:1:1 mcu is 16x16.
	 */
	minx_ <<= 4;
	maxx_ <<= 4;
	miny_ <<= 4;
	maxy_ <<= 4;
	maxx_ += 16;
	maxy_ += 16;
#endif

	return (0);
}

/* Figure F.12: extend sign bit */

#ifdef notdef
#define huff_EXTEND(x,s)  ((x) < extend_test[s] ? (x) + extend_offset[s] : (x))

static const int extend_test[16] =   /* entry n is 2**(n-1) */
  { 0, 0x0001, 0x0002, 0x0004, 0x0008, 0x0010, 0x0020, 0x0040, 0x0080,
    0x0100, 0x0200, 0x0400, 0x0800, 0x1000, 0x2000, 0x4000 };

static const int extend_offset[16] = /* entry n is (-1 << n) + 1 */
  { 0, ((-1)<<1) + 1, ((-1)<<2) + 1, ((-1)<<3) + 1, ((-1)<<4) + 1,
    ((-1)<<5) + 1, ((-1)<<6) + 1, ((-1)<<7) + 1, ((-1)<<8) + 1,
    ((-1)<<9) + 1, ((-1)<<10) + 1, ((-1)<<11) + 1, ((-1)<<12) + 1,
    ((-1)<<13) + 1, ((-1)<<14) + 1, ((-1)<<15) + 1 };
#else
/*
 * Convert the s raw bits in x (s >= 1) to the signed value they
 * encode.  When bit s-1 of x is set the value is x itself; otherwise
 * it is x - 2^s + 1.  Callers never pass s == 0.
 *
 * The high bits are filled with ~(2^s - 1) instead of (-1 << s) so
 * that no negative value is left-shifted.
 */
static inline int
huff_EXTEND(int x, int s)
{
	if ((x >> (s - 1)) & 1)
		return (x);
	return ((x | ~((1 << s) - 1)) + 1);
}
#endif

/*
 * Read the next 16 bits off the bit string into the bit buffer.
 * Skip over zero-stuffed ff's but make no attempt to verify
 * that they aren't some other marker (which shouldn't be in the
 * middle of a block anyway).
 *
 * Reads through the member inb_ and advances it by two to four
 * bytes.  There is no check against the end of the input buffer.
 */
#define HUFFRQ(bb) \
 { \
	register int v; \
	register u_char *cp = inb_; \
 \
	bb <<= 16; \
	v = *cp++; \
	if (v == 0xff) ++cp; \
	bb |= v << 8; \
	v = *cp++; \
	if (v == 0xff) ++cp; \
	bb |= v; \
	inb_ = cp; \
 \
}

/* Mask with the low s bits set. */
#define MASK(s) ((1 << (s)) - 1)

/*
 * Decode one Huffman symbol.  ht is a 64K-entry table built by
 * huffbuild and indexed by the next 16 bits of input; each entry holds
 * the code length in its high byte and the symbol in its low byte.
 * nbb is the number of unread bits held in bb; the buffer is refilled
 * when fewer than 16 remain.  The symbol is stored in result.
 */
#define HUFF_DECODE(ht, nbb, bb, result) { \
	register int s_, v_; \
 \
	if (nbb < 16) { \
		HUFFRQ(bb); \
		nbb += 16; \
	} \
	v_ = (bb >> (nbb - 16)) & 0xffff; \
	s_ = (ht)[v_]; \
	nbb -= (s_ >> 8); \
	result = s_ & 0xff; \
 }

/* Consume n bits from the bit buffer and store them, unsigned, in result. */
#define GET_BITS(n, nbb, bb, result) \
{ \
	nbb -= n; \
	if (nbb < 0)  { \
		HUFFRQ(bb); \
		nbb += 16; \
	} \
	(result) = ((bb >> nbb) & MASK(n)); \
}

/* Consume n bits from the bit buffer without looking at them. */
#define SKIP_BITS(n, nbb, bb) \
{ \
	nbb -= n; \
	if (nbb < 0)  { \
		HUFFRQ(bb); \
		nbb += 16; \
	} \
}

/*
 * Consume one block from the bit stream, keeping only its DC value:
 * the DC difference is decoded and added to the predictor p.dc, and
 * the AC coefficients are skipped without being stored.
 * Always returns 0.
 */
int JpegDecoder::huffdc(component& p)
{
	/* Decode a single block's worth of coefficients */

	/* Section F.2.2.1: decode the DC coefficient difference */
	/* work on local copies of the bit buffer; written back at the end */
	register int bb = bb_;
	register int nbb = nbb_;
	u_short* ht = dcht_[p.dc_tbl_no];
	register int s, r;
	HUFF_DECODE(ht, nbb, bb, s);
	if (s != 0) {
		GET_BITS(s, nbb, bb, r);
		s = huff_EXTEND(r, s);
	}
	/* Convert DC difference to actual value, update predictor */
	s += p.dc;
	p.dc = s;

	/* Section F.2.2.2: decode the AC coefficients */
	ht = acht_[p.ac_tbl_no];
	for (register int k = 1; k < 64; ) {
		/* Symbol-1 */
		register int v;
		HUFF_DECODE(ht, nbb, bb, v);
		/* low nibble: size of symbol-2 in bits; high nibble: zero run */
		s = v & 15;
		r = v >> 4;
		if (s != 0) {
			k += r;
			/* Symbol-2 */
			SKIP_BITS(s, nbb, bb);
			++k;
		} else {
			if (r != 15)
				/* end of block */
				break;
			/* run of 16 zeros */
			k += 16;
		}
	}
	nbb_ = nbb;
	bb_ = bb;

	return (0);
}

/*
 * Parse a huffman-encoded 8x8 block.  Blocks are independent
 * of each other, except for the dc predictor, and blocks can
 * start and end on arbitrary bit boundaries.  No markers should
 * appear in the bit stream, and ff bytes should be zero stuffed
 * (i.e., replaced with ff 00).
 *
 * The block is coded as a sequence of pairs of symbols.  Where the
 * first symbol is a huffman-encoded value <r,n> where r is a four-bit
 * runlength and n is the length of the second symbol (of the pair),
 * which follows verbatim in the bit string.
 *
 * The DC value and the AC coefficients at scan positions 1..5 are
 * decoded first and compared with the NCC values in "cache".  If the
 * summed absolute difference is below thresh_, the rest of the block
 * is consumed without being stored and 0 is returned (the DC
 * predictor is still updated and blk may have been partly written).
 * Otherwise the cache is refreshed, the remaining coefficients are
 * stored in blk, mask[0] / mask[1] receive one bit per stored
 * position (positions 0..31 and 32..63 respectively), and 1 is
 * returned.
 */
int JpegDecoder::huffparse(component& p, short* blk, short* cache,
			   u_int* mask)
{
	register int bb = bb_;
	register int nbb = nbb_;
	u_short* ht = dcht_[p.dc_tbl_no];
	register int s, r;
	HUFF_DECODE(ht, nbb, bb, s);
	if (s != 0) {
		GET_BITS(s, nbb, bb, r);
		s = huff_EXTEND(r, s);
	}

	/* update predictor */
	s += p.dc;
	p.dc = s;
	blk[0] = s;

#if NCC != 6
@BUG@
#endif
	/*
	 * sblk is viewed as six shorts below; the DC value is placed so
	 * that it lands in short 0 for either byte order.
	 * NOTE(review): on little-endian hosts a negative s also sets all
	 * bits of short 1, which stays that way unless scan position 1 is
	 * coded -- confirm whether that is intended.
	 */
	u_int sblk[3];
#if BYTE_ORDER == LITTLE_ENDIAN
	sblk[0] = s;
#else
	sblk[0] = s << 16;
#endif
	sblk[1] = 0;
	sblk[2] = 0;

	/*
	 * First, grab only a few low frequency coefficients.
	 * If they aren't sufficiently different from the current
	 * block, skip over this block quickly.
	 */
	ht = acht_[p.ac_tbl_no];
	register int k = 1;
	int m0 = 0;
	for (;;) {
		register int v;
		/* Symbol-1 */
		HUFF_DECODE(ht, nbb, bb, v);
		s = v & 15;
		r = v >> 4;
		if (s != 0) {
			k += r;
			if (k >= 6) {
				/*
				 * Past the cached range: undo the run and
				 * leave s/r pending for the loops below.
				 */
				k -= r;
				break;
			}
			/* Symbol-2 */
			GET_BITS(s, nbb, bb, v);
			s = huff_EXTEND(v, s);
			((short*)sblk)[k] = s;/*FIXME*/
			v = ZAG[k];
			m0 |= 1 << v;
			blk[v] = s;
			++k;
		} else
			/* end of block or 16-zero run; s/r are left pending */
			break;
	}
	if (blkdiff((short*)sblk, cache) < thresh_) {
		/* skip this block */
		/* each pass first handles the pending symbol, then reads the next */
		for (;;) {
			if (s != 0) {
				k += r;
				/* Symbol-2 */
				SKIP_BITS(s, nbb, bb);
				++k;
			} else {
				if (r != 15)
				/* end of block */
					break;
				k += 16;
			}
			if (k >= 64)
				break;

			/* Symbol-1 */
			register int v;
			HUFF_DECODE(ht, nbb, bb, v);
			s = v & 15;
			r = v >> 4;
		}
		nbb_ = nbb;
		bb_ = bb;

		return (0);
	}

#if NCC != 6
@BUG@
#endif
	/* Remember the low-frequency values for the next comparison. */
	((u_int*)cache)[0] = sblk[0];
	((u_int*)cache)[1] = sblk[1];
	((u_int*)cache)[2] = sblk[2];

	int m1 = 0;
	/* each pass first handles the pending symbol, then reads the next */
	for (;;) {
		register int v;
		if (s != 0) {
			k += r;
			/* Symbol-2 */
			GET_BITS(s, nbb, bb, v);
			s = huff_EXTEND(v, s);
			v = ZAG[k];
			blk[v] = s;

			/*
			 * This sets bit "v" if v < 32, otherwise
			 * it sets bit 0, but this is okay since
			 * we always set blk[0] (which is okay
			 * because the dc is rarely 0).
			 */
			m0 |= 1 << (v & ((v-32) >> 31));
			/*
			 * If v >= 32, this sets bit v-32 in m1.
			 * Otherwise, it does nothing.
			 */
			v -= 32;
			m1 |= (~v >> 31 & 1) << v;

			++k;
		} else {
			if (r != 15)
				/* end of block */
				break;

			k += 16;
		}
		if (k >= 64)
			break;

		/* Symbol-1 */
		HUFF_DECODE(ht, nbb, bb, v);
		s = v & 15;
		r = v >> 4;
	}
	mask[0] = m0;
	mask[1] = m1;
	nbb_ = nbb;
	bb_ = bb;

	return (1);
}

/*
 * Parse one block unconditionally ("forced" variant of huffparse):
 * every coefficient is decoded into blk, mask[0] / mask[1] receive one
 * bit per stored position, and the first NCC shorts of blk are copied
 * into cache.  No comparison against the cache is made.
 */
void JpegDecoder::huffparsef(component& p, short* blk, short* cache,
			    u_int* mask)
{
	register int bb = bb_;
	register int nbb = nbb_;
	u_short* ht = dcht_[p.dc_tbl_no];
	register int s, r;
	HUFF_DECODE(ht, nbb, bb, s);
	if (s != 0) {
		GET_BITS(s, nbb, bb, r);
		s = huff_EXTEND(r, s);
	}

	/* update predictor */
	s += p.dc;
	p.dc = s;
	blk[0] = s;

	/*
	 * Decode all AC coefficients; unlike huffparse there is no
	 * early exit for blocks that resemble the cached one.
	 */
	ht = acht_[p.ac_tbl_no];
	register int k = 1;
	int m0 = 0;
	int m1 = 0;
	do {
		/* Symbol-1 */
		register int v;
		HUFF_DECODE(ht, nbb, bb, v);
		s = v & 15;
		r = v >> 4;

		if (s != 0) {
			k += r;
			/* Symbol-2 */
			GET_BITS(s, nbb, bb, v);
			s = huff_EXTEND(v, s);
			v = ZAG[k];
			blk[v] = s;

			/*
			 * This sets bit "v" if v < 32, otherwise
			 * it sets bit 0, but this is okay since
			 * we always set blk[0] (which is okay
			 * because the dc is rarely 0).
			 */
			m0 |= 1 << (v & ((v-32) >> 31));
			/*
			 * If v >= 32, this sets bit v-32 in m1.
			 * Otherwise, it does nothing.
			 */
			v -= 32;
			m1 |= (~v >> 31 & 1) << v;

			++k;
		} else {
			if (r != 15)
				/* end of block */
				break;

			k += 16;
		}
	} while (k < 64);

	mask[0] = m0;
	mask[1] = m1;
	nbb_ = nbb;
	bb_ = bb;

	/*
	 * Copy three words (six shorts) of blk into the cache.
	 * NOTE(review): huffparse fills the cache by scan position, while
	 * this copies blk[0..5] by block position -- confirm the two are
	 * meant to be comparable.
	 */
	((u_int*)cache)[0] = ((u_int*)blk)[0];
	((u_int*)cache)[1] = ((u_int*)blk)[1];
	((u_int*)cache)[2] = ((u_int*)blk)[2];
}

/*
 * Skip over a block.  Don't even update the dc predictor.
 *
 * The bits of the block are consumed from the stream but nothing is
 * stored.  Always returns 0.
 */
int JpegDecoder::huffskip(component& p)
{
	register int bb = bb_;
	register int nbb = nbb_;
	u_short* ht = dcht_[p.dc_tbl_no];
	register int s;
	/* DC: the symbol is the bit length of the difference that follows */
	HUFF_DECODE(ht, nbb, bb, s);
	if (s != 0) {
		SKIP_BITS(s, nbb, bb);
	}
	ht = acht_[p.ac_tbl_no];
	for (register int k = 1; k < 64; ) {
		/* Symbol-1 */
		register int v;
		HUFF_DECODE(ht, nbb, bb, v);
		/* low nibble: size of symbol-2 in bits; high nibble: zero run */
		s = v & 15;
		register int r = v >> 4;
		if (s != 0) {
			k += r;
			/* Symbol-2 */
			SKIP_BITS(s, nbb, bb);
			++k;
		} else {
			if (r != 15)
				/* end of block */
				break;
			/* run of 16 zeros */
			k += 16;
		}
	}
	nbb_ = nbb;
	bb_ = bb;

	return (0);
}

/*
 * Reset the entropy decoder: zero the dc predictor of components
 * 0, 1 and 2 and clear nbb_ (presumably the count of bits currently
 * buffered in bb_ -- confirm against the bit-fetch macros).
 */
void JpegDecoder::huffreset()
{
	for (int c = 0; c < 3; ++c)
		comp_[c].dc = 0;
	nbb_ = 0;
}

/*
 * Build a 64k lookup table from the jpeg huffman table described
 * by the arguments.  The table is indexed by the next 16-bits
 * of the input stream.  The entry in the table tells us the
 * length (and value) of the next symbol: (codelen << 8) | value.
 * (Of course, the value is redundant with the input bits).
 * Entries that no code maps to are left zero.
 *
 * bits[1..16] holds the number of codes of each length (bits[0] is
 * not read); vals[] holds the symbol values in code-length order.
 *
 * The returned table is allocated with new[]; the caller owns it.
 *
 * The description may come from a corrupt stream, so it is not
 * trusted: at most 256 symbols are accepted, and a code that does not
 * fit in its own length (an over-subscribed table) is dropped instead
 * of being written outside the table.  Well-formed tables produce
 * exactly the same result as before.
 */
u_short* JpegDecoder::huffbuild(const u_char* bits, const u_char* vals) const
{
	/* Figure C.1: make table of Huffman code length for each symbol */
	/* Note that this is in code-length order. */

	int nsym = 0;
	int huffsize[257];
	for (int codelen = 1; codelen <= 16; ++codelen) {
		/*
		 * Stop at 256 symbols so that a bogus count can overrun
		 * neither huffsize[] nor huffcode[] (slot 256 is needed
		 * for the terminator below).
		 */
		for (int i = 1; i <= bits[codelen] && nsym < 256; ++i)
			huffsize[nsym++] = codelen;
	}
	huffsize[nsym] = 0;

	/* Figure C.2: generate the codes themselves */
	/* Note that this is in code-length order. */

	int code = 0;
	int si = huffsize[0];
	/*
	 * Kept as int (not u_short) so that a code which has grown past
	 * 16 bits is still recognisable as invalid further down.
	 */
	int huffcode[256];
	int p = 0;
	while (p < nsym) {
		/* the 0 terminator in huffsize[nsym] ends this run */
		while (huffsize[p] == si)
			huffcode[p++] = code++;

		code <<= 1;
		++si;
	}
	/*
	 * Build the direct-map lookup table.
	 */
	u_short *ht = new u_short[65536];
	bzero((char*)ht, 65536 * sizeof(u_short));
	for (int sym = 0; sym < nsym; ++sym) {
		int codelen = huffsize[sym];
		/*
		 * A valid code of length codelen is < 2**codelen.  Anything
		 * larger would index past the end of ht once shifted.
		 */
		if ((huffcode[sym] >> codelen) != 0)
			continue;
		int nbit = 16 - codelen;
		int base = huffcode[sym] << nbit;
		int map = (codelen << 8) | vals[sym];
		/*
		 * The low nbit bits are don't cares.
		 * Spin through all possible combos.
		 */
		for (int n = 1 << nbit; --n >= 0; )
			ht[base | n] = map;
	}
	return (ht);
}

/*
 * xidct --
 *
 *	Two-dimensional 8x8 inverse DCT that skips work for coefficients
 *	known to be zero, writing clamped 8-bit samples.
 *
 *	bp	64 coefficients, eight per row.  Overwritten: after pass 1
 *		it holds the row-transformed intermediate values.
 *	mask	two words flagging which coefficients may be non-zero:
 *		bit i of mask[0] covers coefficient i (i < 32) and bit
 *		i-32 of mask[1] covers coefficient i (i >= 32).  In pass 1
 *		a coefficient whose bit is clear is never read; it is
 *		treated as zero.
 *	p	destination; eight rows of eight bytes are written.
 *	stride	distance in bytes between successive destination rows.
 *
 *	Every output sample is descaled, biased by +128 and limited to
 *	[0..255] with UCLIMIT.
 *
 *	Each of the two passes selects, for the even and for the odd half
 *	of the 1-D transform, one of sixteen hand-specialised formulas
 *	according to which of the four inputs are zero.  The specialised
 *	formulas use pre-combined constants, so they are not guaranteed to
 *	be bit-identical to the general one; do not merge branches.
 */
void
xidct(register short* bp, u_int* mask, u_char* p, int stride)
{
  /* Pass 1: process rows. */
  /* Note results are scaled up by sqrt(8) compared to a true IDCT; */
  /* furthermore, we scale the results by 2**PASS1_BITS. */

  u_int m0 = mask[0];
  u_int m1 = mask[1];

  /*
   * Shared by both loops below.  It is declared here rather than in the
   * first for-statement so that the second loop is valid under the
   * standard for-scope rule as well as the pre-standard one.
   */
  int rowctr;

  /*
   * FIXME: the DC bit of the incoming mask is not relied upon; derive
   * it from the coefficient itself.
   */
  if (*bp)
	  m0 |= 1;

  for (rowctr = 0; rowctr < 8; ++rowctr) {
    int tmp0, tmp1, tmp2, tmp3;
    int tmp10, tmp11, tmp12, tmp13;
    int z1, z2, z3, z4, z5;

    /*
     * Due to quantization, we will usually find that many of the input
     * coefficients are zero, especially the AC terms.  We can exploit this
     * by short-circuiting the IDCT calculation for any row in which all
     * the AC terms are zero.  In that case each output is equal to the
     * DC coefficient (with scale factor as needed).
     * With typical images and quantization tables, half or more of the
     * row DCT calculations can be simplified this way.
     */
    if ((m0 & 0xfe) == 0) {
      /* AC terms all zero */
      short v = 0;
      if (m0 & 1)
	  v = (short)(bp[0] << PASS1_BITS);
      /*
       * Plain element stores: same 16-bit pattern in every slot as the
       * former word-wide stores, without aliasing a short array through
       * a u_int pointer.
       */
      bp[0] = v;
      bp[1] = v;
      bp[2] = v;
      bp[3] = v;
      bp[4] = v;
      bp[5] = v;
      bp[6] = v;
      bp[7] = v;
      goto nextrow;
    }

    /* Even part: reverse the even part of the forward DCT. */
    /* The rotator is sqrt(2)*c(-6). */
    if (m0 & 1 << 6) {
	int d6 = bp[6];
	if (m0 & 1 << 4) {
	    int d4 = bp[4];
	    if (m0 & 1 << 2) {
	        int d2 = bp[2];
		if (m0 & 1) {
		    /* d0 != 0, d2 != 0, d4 != 0, d6 != 0 */
		    int d0 = bp[0];
		    z1 = MULTIPLY(d2 + d6, FIX(0.541196100));
		    tmp2 = z1 + MULTIPLY(d6, - FIX(1.847759065));
		    tmp3 = z1 + MULTIPLY(d2, FIX(0.765366865));

		    tmp0 = (d0 + d4) << CONST_BITS;
		    tmp1 = (d0 - d4) << CONST_BITS;

		    tmp10 = tmp0 + tmp3;
		    tmp13 = tmp0 - tmp3;
		    tmp11 = tmp1 + tmp2;
		    tmp12 = tmp1 - tmp2;
		} else {
		    /* d0 == 0, d2 != 0, d4 != 0, d6 != 0 */
		    z1 = MULTIPLY(d2 + d6, FIX(0.541196100));
		    tmp2 = z1 + MULTIPLY(d6, - FIX(1.847759065));
		    tmp3 = z1 + MULTIPLY(d2, FIX(0.765366865));

		    tmp0 = d4 << CONST_BITS;

		    tmp10 = tmp0 + tmp3;
		    tmp13 = tmp0 - tmp3;
		    tmp11 = tmp2 - tmp0;
		    tmp12 = -(tmp0 + tmp2);
		}
	    } else {
		if (m0 & 1) {
		    /* d0 != 0, d2 == 0, d4 != 0, d6 != 0 */
		    int d0 = bp[0];
		    tmp2 = MULTIPLY(d6, - FIX(1.306562965));
		    tmp3 = MULTIPLY(d6, FIX(0.541196100));

		    tmp0 = (d0 + d4) << CONST_BITS;
		    tmp1 = (d0 - d4) << CONST_BITS;

		    tmp10 = tmp0 + tmp3;
		    tmp13 = tmp0 - tmp3;
		    tmp11 = tmp1 + tmp2;
		    tmp12 = tmp1 - tmp2;
		} else {
		    /* d0 == 0, d2 == 0, d4 != 0, d6 != 0 */
		    tmp2 = MULTIPLY(d6, -FIX(1.306562965));
		    tmp3 = MULTIPLY(d6, FIX(0.541196100));

		    tmp0 = d4 << CONST_BITS;

		    tmp10 = tmp0 + tmp3;
		    tmp13 = tmp0 - tmp3;
		    tmp11 = tmp2 - tmp0;
		    tmp12 = -(tmp0 + tmp2);
		}
	    }
	} else {
	    if (m0 & 1 << 2) {
	        int d2 = bp[2];
		if (m0 & 1) {
		    /* d0 != 0, d2 != 0, d4 == 0, d6 != 0 */
		    int d0 = bp[0];
		    z1 = MULTIPLY(d2 + d6, FIX(0.541196100));
		    tmp2 = z1 + MULTIPLY(d6, - FIX(1.847759065));
		    tmp3 = z1 + MULTIPLY(d2, FIX(0.765366865));

		    tmp0 = d0 << CONST_BITS;

		    tmp10 = tmp0 + tmp3;
		    tmp13 = tmp0 - tmp3;
		    tmp11 = tmp0 + tmp2;
		    tmp12 = tmp0 - tmp2;
		} else {
		    /* d0 == 0, d2 != 0, d4 == 0, d6 != 0 */
		    z1 = MULTIPLY(d2 + d6, FIX(0.541196100));
		    tmp2 = z1 + MULTIPLY(d6, - FIX(1.847759065));
		    tmp3 = z1 + MULTIPLY(d2, FIX(0.765366865));

		    tmp10 = tmp3;
		    tmp13 = -tmp3;
		    tmp11 = tmp2;
		    tmp12 = -tmp2;
		}
	    } else {
		if (m0 & 1) {
		    /* d0 != 0, d2 == 0, d4 == 0, d6 != 0 */
		    int d0 = bp[0];
		    tmp2 = MULTIPLY(d6, - FIX(1.306562965));
		    tmp3 = MULTIPLY(d6, FIX(0.541196100));

		    tmp0 = d0 << CONST_BITS;

		    tmp10 = tmp0 + tmp3;
		    tmp13 = tmp0 - tmp3;
		    tmp11 = tmp0 + tmp2;
		    tmp12 = tmp0 - tmp2;
		} else {
		    /* d0 == 0, d2 == 0, d4 == 0, d6 != 0 */
		    tmp2 = MULTIPLY(d6, - FIX(1.306562965));
		    tmp3 = MULTIPLY(d6, FIX(0.541196100));

		    tmp10 = tmp3;
		    tmp13 = -tmp3;
		    tmp11 = tmp2;
		    tmp12 = -tmp2;
		}
	    }
	}
    } else {
	if (m0 & 1 << 4) {
	    int d4 = bp[4];
	    if (m0 & 1 << 2) {
		int d2 = bp[2];
		if (m0 & 1) {
		    /* d0 != 0, d2 != 0, d4 != 0, d6 == 0 */
		    int d0 = bp[0];
		    tmp2 = MULTIPLY(d2, FIX(0.541196100));
		    tmp3 = MULTIPLY(d2, FIX(1.306562965));

		    tmp0 = (d0 + d4) << CONST_BITS;
		    tmp1 = (d0 - d4) << CONST_BITS;

		    tmp10 = tmp0 + tmp3;
		    tmp13 = tmp0 - tmp3;
		    tmp11 = tmp1 + tmp2;
		    tmp12 = tmp1 - tmp2;
		} else {
		    /* d0 == 0, d2 != 0, d4 != 0, d6 == 0 */
		    tmp2 = MULTIPLY(d2, FIX(0.541196100));
		    tmp3 = MULTIPLY(d2, FIX(1.306562965));

		    tmp0 = d4 << CONST_BITS;

		    tmp10 = tmp0 + tmp3;
		    tmp13 = tmp0 - tmp3;
		    tmp11 = tmp2 - tmp0;
		    tmp12 = -(tmp0 + tmp2);
		}
	    } else {
		if (m0 & 1) {
		    /* d0 != 0, d2 == 0, d4 != 0, d6 == 0 */
		    int d0 = bp[0];
		    tmp10 = tmp13 = (d0 + d4) << CONST_BITS;
		    tmp11 = tmp12 = (d0 - d4) << CONST_BITS;
		} else {
		    /* d0 == 0, d2 == 0, d4 != 0, d6 == 0 */
		    tmp10 = tmp13 = d4 << CONST_BITS;
		    tmp11 = tmp12 = -tmp10;
		}
	    }
	} else {
	    if (m0 & 1 << 2) {
		int d2 = bp[2];
		if (m0 & 1) {
		    /* d0 != 0, d2 != 0, d4 == 0, d6 == 0 */
		    int d0 = bp[0];
		    tmp2 = MULTIPLY(d2, FIX(0.541196100));
		    tmp3 = MULTIPLY(d2, FIX(1.306562965));

		    tmp0 = d0 << CONST_BITS;

		    tmp10 = tmp0 + tmp3;
		    tmp13 = tmp0 - tmp3;
		    tmp11 = tmp0 + tmp2;
		    tmp12 = tmp0 - tmp2;
		} else {
		    /* d0 == 0, d2 != 0, d4 == 0, d6 == 0 */
		    tmp2 = MULTIPLY(d2, FIX(0.541196100));
		    tmp3 = MULTIPLY(d2, FIX(1.306562965));

		    tmp10 = tmp3;
		    tmp13 = -tmp3;
		    tmp11 = tmp2;
		    tmp12 = -tmp2;
		}
	    } else {
		if (m0 & 1) {
		    /* d0 != 0, d2 == 0, d4 == 0, d6 == 0 */
		    int d0 = bp[0];
		    tmp10 = tmp13 = tmp11 = tmp12 = d0 << CONST_BITS;
		} else {
		    /* d0 == 0, d2 == 0, d4 == 0, d6 == 0 */
		    tmp10 = tmp13 = tmp11 = tmp12 = 0;
		}
	    }
	}
    }


    /* Odd part per figure 8; the matrix is unitary and hence its
     * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
     */

    if (m0 & 1 << 7) {
	int d7 = bp[7];
	if (m0 & 1 << 5) {
	    int d5 = bp[5];
	    if (m0 & 1 << 3) {
	        int d3 = bp[3];
		if (m0 & 1 << 1) {
		    int d1 = bp[1];
		    /* d1 != 0, d3 != 0, d5 != 0, d7 != 0 */
		    z1 = d7 + d1;
		    z2 = d5 + d3;
		    z3 = d7 + d3;
		    z4 = d5 + d1;
		    z5 = MULTIPLY(z3 + z4, FIX(1.175875602));

		    tmp0 = MULTIPLY(d7, FIX(0.298631336));
		    tmp1 = MULTIPLY(d5, FIX(2.053119869));
		    tmp2 = MULTIPLY(d3, FIX(3.072711026));
		    tmp3 = MULTIPLY(d1, FIX(1.501321110));
		    z1 = MULTIPLY(z1, - FIX(0.899976223));
		    z2 = MULTIPLY(z2, - FIX(2.562915447));
		    z3 = MULTIPLY(z3, - FIX(1.961570560));
		    z4 = MULTIPLY(z4, - FIX(0.390180644));

		    z3 += z5;
		    z4 += z5;

		    tmp0 += z1 + z3;
		    tmp1 += z2 + z4;
		    tmp2 += z2 + z3;
		    tmp3 += z1 + z4;
		} else {
		    /* d1 == 0, d3 != 0, d5 != 0, d7 != 0 */
		    z1 = d7;
		    z2 = d5 + d3;
		    z3 = d7 + d3;
		    z5 = MULTIPLY(z3 + d5, FIX(1.175875602));

		    tmp0 = MULTIPLY(d7, FIX(0.298631336));
		    tmp1 = MULTIPLY(d5, FIX(2.053119869));
		    tmp2 = MULTIPLY(d3, FIX(3.072711026));
		    z1 = MULTIPLY(d7, - FIX(0.899976223));
		    z2 = MULTIPLY(z2, - FIX(2.562915447));
		    z3 = MULTIPLY(z3, - FIX(1.961570560));
		    z4 = MULTIPLY(d5, - FIX(0.390180644));

		    z3 += z5;
		    z4 += z5;

		    tmp0 += z1 + z3;
		    tmp1 += z2 + z4;
		    tmp2 += z2 + z3;
		    tmp3 = z1 + z4;
		}
	    } else {
		if (m0 & 1 << 1) {
		    int d1 = bp[1];
		    /* d1 != 0, d3 == 0, d5 != 0, d7 != 0 */
		    z1 = d7 + d1;
		    z2 = d5;
		    z3 = d7;
		    z4 = d5 + d1;
		    z5 = MULTIPLY(z3 + z4, FIX(1.175875602));

		    tmp0 = MULTIPLY(d7, FIX(0.298631336));
		    tmp1 = MULTIPLY(d5, FIX(2.053119869));
		    tmp3 = MULTIPLY(d1, FIX(1.501321110));
		    z1 = MULTIPLY(z1, - FIX(0.899976223));
		    z2 = MULTIPLY(d5, - FIX(2.562915447));
		    z3 = MULTIPLY(d7, - FIX(1.961570560));
		    z4 = MULTIPLY(z4, - FIX(0.390180644));

		    z3 += z5;
		    z4 += z5;

		    tmp0 += z1 + z3;
		    tmp1 += z2 + z4;
		    tmp2 = z2 + z3;
		    tmp3 += z1 + z4;
		} else {
		    /* d1 == 0, d3 == 0, d5 != 0, d7 != 0 */
		    tmp0 = MULTIPLY(d7, - FIX(0.601344887));
		    z1 = MULTIPLY(d7, - FIX(0.899976223));
		    z3 = MULTIPLY(d7, - FIX(1.961570560));
		    tmp1 = MULTIPLY(d5, - FIX(0.509795578));
		    z2 = MULTIPLY(d5, - FIX(2.562915447));
		    z4 = MULTIPLY(d5, - FIX(0.390180644));
		    z5 = MULTIPLY(d5 + d7, FIX(1.175875602));

		    z3 += z5;
		    z4 += z5;

		    tmp0 += z3;
		    tmp1 += z4;
		    tmp2 = z2 + z3;
		    tmp3 = z1 + z4;
		}
	    }
	} else {
	    if (m0 & 1 << 3) {
		int d3 = bp[3];
		if (m0 & 1 << 1) {
		    int d1 = bp[1];
		    /* d1 != 0, d3 != 0, d5 == 0, d7 != 0 */
		    z1 = d7 + d1;
		    z3 = d7 + d3;
		    z5 = MULTIPLY(z3 + d1, FIX(1.175875602));

		    tmp0 = MULTIPLY(d7, FIX(0.298631336));
		    tmp2 = MULTIPLY(d3, FIX(3.072711026));
		    tmp3 = MULTIPLY(d1, FIX(1.501321110));
		    z1 = MULTIPLY(z1, - FIX(0.899976223));
		    z2 = MULTIPLY(d3, - FIX(2.562915447));
		    z3 = MULTIPLY(z3, - FIX(1.961570560));
		    z4 = MULTIPLY(d1, - FIX(0.390180644));

		    z3 += z5;
		    z4 += z5;

		    tmp0 += z1 + z3;
		    tmp1 = z2 + z4;
		    tmp2 += z2 + z3;
		    tmp3 += z1 + z4;
		} else {
		    /* d1 == 0, d3 != 0, d5 == 0, d7 != 0 */
		    z3 = d7 + d3;

		    tmp0 = MULTIPLY(d7, - FIX(0.601344887));
		    z1 = MULTIPLY(d7, - FIX(0.899976223));
		    tmp2 = MULTIPLY(d3, FIX(0.509795579));
		    z2 = MULTIPLY(d3, - FIX(2.562915447));
		    z5 = MULTIPLY(z3, FIX(1.175875602));
		    z3 = MULTIPLY(z3, - FIX(0.785694958));

		    tmp0 += z3;
		    tmp1 = z2 + z5;
		    tmp2 += z3;
		    tmp3 = z1 + z5;
		}
	    } else {
		if (m0 & 1 << 1) {
		    int d1 = bp[1];
		    /* d1 != 0, d3 == 0, d5 == 0, d7 != 0 */
		    z1 = d7 + d1;
		    z5 = MULTIPLY(z1, FIX(1.175875602));

		    z1 = MULTIPLY(z1, FIX(0.275899379));
		    z3 = MULTIPLY(d7, - FIX(1.961570560));
		    tmp0 = MULTIPLY(d7, - FIX(1.662939224));
		    z4 = MULTIPLY(d1, - FIX(0.390180644));
		    tmp3 = MULTIPLY(d1, FIX(1.111140466));

		    tmp0 += z1;
		    tmp1 = z4 + z5;
		    tmp2 = z3 + z5;
		    tmp3 += z1;
		} else {
		    /* d1 == 0, d3 == 0, d5 == 0, d7 != 0 */
		    tmp0 = MULTIPLY(d7, - FIX(1.387039845));
		    tmp1 = MULTIPLY(d7, FIX(1.175875602));
		    tmp2 = MULTIPLY(d7, - FIX(0.785694958));
		    tmp3 = MULTIPLY(d7, FIX(0.275899379));
		}
	    }
	}
    } else {
	if (m0 & 1 << 5) {
	    int d5 = bp[5];
	    if (m0 & 1 << 3) {
		int d3 = bp[3];
		if (m0 & 1 << 1) {
		    /* d1 != 0, d3 != 0, d5 != 0, d7 == 0 */
		    int d1 = bp[1];
		    z2 = d5 + d3;
		    z4 = d5 + d1;
		    z5 = MULTIPLY(d3 + z4, FIX(1.175875602));

		    tmp1 = MULTIPLY(d5, FIX(2.053119869));
		    tmp2 = MULTIPLY(d3, FIX(3.072711026));
		    tmp3 = MULTIPLY(d1, FIX(1.501321110));
		    z1 = MULTIPLY(d1, - FIX(0.899976223));
		    z2 = MULTIPLY(z2, - FIX(2.562915447));
		    z3 = MULTIPLY(d3, - FIX(1.961570560));
		    z4 = MULTIPLY(z4, - FIX(0.390180644));

		    z3 += z5;
		    z4 += z5;

		    tmp0 = z1 + z3;
		    tmp1 += z2 + z4;
		    tmp2 += z2 + z3;
		    tmp3 += z1 + z4;
		} else {
		    /* d1 == 0, d3 != 0, d5 != 0, d7 == 0 */
		    z2 = d5 + d3;

		    z5 = MULTIPLY(z2, FIX(1.175875602));
		    tmp1 = MULTIPLY(d5, FIX(1.662939225));
		    z4 = MULTIPLY(d5, - FIX(0.390180644));
		    z2 = MULTIPLY(z2, - FIX(1.387039845));
		    tmp2 = MULTIPLY(d3, FIX(1.111140466));
		    z3 = MULTIPLY(d3, - FIX(1.961570560));

		    tmp0 = z3 + z5;
		    tmp1 += z2;
		    tmp2 += z2;
		    tmp3 = z4 + z5;
		}
	    } else {
		if (m0 & 1 << 1) {
		    /* d1 != 0, d3 == 0, d5 != 0, d7 == 0 */
		    int d1 = bp[1];
		    z4 = d5 + d1;

		    z5 = MULTIPLY(z4, FIX(1.175875602));
		    z1 = MULTIPLY(d1, - FIX(0.899976223));
		    tmp3 = MULTIPLY(d1, FIX(0.601344887));
		    tmp1 = MULTIPLY(d5, - FIX(0.509795578));
		    z2 = MULTIPLY(d5, - FIX(2.562915447));
		    z4 = MULTIPLY(z4, FIX(0.785694958));

		    tmp0 = z1 + z5;
		    tmp1 += z4;
		    tmp2 = z2 + z5;
		    tmp3 += z4;
		} else {
		    /* d1 == 0, d3 == 0, d5 != 0, d7 == 0 */
		    tmp0 = MULTIPLY(d5, FIX(1.175875602));
		    tmp1 = MULTIPLY(d5, FIX(0.275899380));
		    tmp2 = MULTIPLY(d5, - FIX(1.387039845));
		    tmp3 = MULTIPLY(d5, FIX(0.785694958));
		}
	    }
	} else {
	    if (m0 & 1 << 3) {
		int d3 = bp[3];
		if (m0 & 1 << 1) {
		    /* d1 != 0, d3 != 0, d5 == 0, d7 == 0 */
		    int d1 = bp[1];
		    z5 = d1 + d3;
		    tmp3 = MULTIPLY(d1, FIX(0.211164243));
		    tmp2 = MULTIPLY(d3, - FIX(1.451774981));
		    z1 = MULTIPLY(d1, FIX(1.061594337));
		    z2 = MULTIPLY(d3, - FIX(2.172734803));
		    z4 = MULTIPLY(z5, FIX(0.785694958));
		    z5 = MULTIPLY(z5, FIX(1.175875602));

		    tmp0 = z1 - z4;
		    tmp1 = z2 + z4;
		    tmp2 += z5;
		    tmp3 += z5;
		} else {
		    /* d1 == 0, d3 != 0, d5 == 0, d7 == 0 */
		    tmp0 = MULTIPLY(d3, - FIX(0.785694958));
		    tmp1 = MULTIPLY(d3, - FIX(1.387039845));
		    tmp2 = MULTIPLY(d3, - FIX(0.275899379));
		    tmp3 = MULTIPLY(d3, FIX(1.175875602));
		}
	    } else {
		if (m0 & 1 << 1) {
		    /* d1 != 0, d3 == 0, d5 == 0, d7 == 0 */
		    int d1 = bp[1];
		    tmp0 = MULTIPLY(d1, FIX(0.275899379));
		    tmp1 = MULTIPLY(d1, FIX(0.785694958));
		    tmp2 = MULTIPLY(d1, FIX(1.175875602));
		    tmp3 = MULTIPLY(d1, FIX(1.387039845));
		} else {
		    /* d1 == 0, d3 == 0, d5 == 0, d7 == 0 */
		    tmp0 = tmp1 = tmp2 = tmp3 = 0;
		}
	    }
	}
    }

    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */

    bp[0] =  DESCALE(tmp10 + tmp3, CONST_BITS-PASS1_BITS);
    bp[7] =  DESCALE(tmp10 - tmp3, CONST_BITS-PASS1_BITS);
    bp[1] =  DESCALE(tmp11 + tmp2, CONST_BITS-PASS1_BITS);
    bp[6] =  DESCALE(tmp11 - tmp2, CONST_BITS-PASS1_BITS);
    bp[2] =  DESCALE(tmp12 + tmp1, CONST_BITS-PASS1_BITS);
    bp[5] =  DESCALE(tmp12 - tmp1, CONST_BITS-PASS1_BITS);
    bp[3] =  DESCALE(tmp13 + tmp0, CONST_BITS-PASS1_BITS);
    bp[4] =  DESCALE(tmp13 - tmp0, CONST_BITS-PASS1_BITS);

 nextrow:
    bp += 8;		/* advance pointer to next row */
    /* slide the 64-bit mask down so its low 8 bits describe the next row */
    m0 >>= 8;
    m0 |= m1 << 24;
    m1 >>= 8;
  }

  /* Pass 2: process columns. */
  /* Note that we must descale the results by a factor of 8 == 2**3, */
  /* and also undo the PASS1_BITS scaling. */

  bp -= 64;		/* rewind to the start of the block */
  for (rowctr = 8; --rowctr >= 0;) {
    int tmp0, tmp1, tmp2, tmp3;
    int tmp10, tmp11, tmp12, tmp13;
    int z1, z2, z3, z4, z5;

    /* Columns of zeroes can be exploited in the same way as we did with rows.
     * However, the row calculation has created many nonzero AC terms, so the
     * simplification applies less often (typically 5% to 10% of the time).
     * On machines with very fast multiplication, it's possible that the
     * test takes more time than it's worth.  In that case this section
     * may be commented out.
     */

    /*
     * The mask no longer applies after pass 1, so zero terms are
     * detected from the values themselves.
     */
    int d0 = bp[8*0];
    int d1 = bp[8*1];
    int d2 = bp[8*2];
    int d3 = bp[8*3];
    int d4 = bp[8*4];
    int d5 = bp[8*5];
    int d6 = bp[8*6];
    int d7 = bp[8*7];

    /* Even part: reverse the even part of the forward DCT. */
    /* The rotator is sqrt(2)*c(-6). */
    if (d6) {
	if (d4) {
	    if (d2) {
		if (d0) {
		    /* d0 != 0, d2 != 0, d4 != 0, d6 != 0 */
		    z1 = MULTIPLY(d2 + d6, FIX(0.541196100));
		    tmp2 = z1 + MULTIPLY(d6, - FIX(1.847759065));
		    tmp3 = z1 + MULTIPLY(d2, FIX(0.765366865));

		    tmp0 = (d0 + d4) << CONST_BITS;
		    tmp1 = (d0 - d4) << CONST_BITS;

		    tmp10 = tmp0 + tmp3;
		    tmp13 = tmp0 - tmp3;
		    tmp11 = tmp1 + tmp2;
		    tmp12 = tmp1 - tmp2;
		} else {
		    /* d0 == 0, d2 != 0, d4 != 0, d6 != 0 */
		    z1 = MULTIPLY(d2 + d6, FIX(0.541196100));
		    tmp2 = z1 + MULTIPLY(d6, - FIX(1.847759065));
		    tmp3 = z1 + MULTIPLY(d2, FIX(0.765366865));

		    tmp0 = d4 << CONST_BITS;

		    tmp10 = tmp0 + tmp3;
		    tmp13 = tmp0 - tmp3;
		    tmp11 = tmp2 - tmp0;
		    tmp12 = -(tmp0 + tmp2);
		}
	    } else {
		if (d0) {
		    /* d0 != 0, d2 == 0, d4 != 0, d6 != 0 */
		    tmp2 = MULTIPLY(d6, - FIX(1.306562965));
		    tmp3 = MULTIPLY(d6, FIX(0.541196100));

		    tmp0 = (d0 + d4) << CONST_BITS;
		    tmp1 = (d0 - d4) << CONST_BITS;

		    tmp10 = tmp0 + tmp3;
		    tmp13 = tmp0 - tmp3;
		    tmp11 = tmp1 + tmp2;
		    tmp12 = tmp1 - tmp2;
		} else {
		    /* d0 == 0, d2 == 0, d4 != 0, d6 != 0 */
		    tmp2 = MULTIPLY(d6, -FIX(1.306562965));
		    tmp3 = MULTIPLY(d6, FIX(0.541196100));

		    tmp0 = d4 << CONST_BITS;

		    tmp10 = tmp0 + tmp3;
		    tmp13 = tmp0 - tmp3;
		    tmp11 = tmp2 - tmp0;
		    tmp12 = -(tmp0 + tmp2);
		}
	    }
	} else {
	    if (d2) {
		if (d0) {
		    /* d0 != 0, d2 != 0, d4 == 0, d6 != 0 */
		    z1 = MULTIPLY(d2 + d6, FIX(0.541196100));
		    tmp2 = z1 + MULTIPLY(d6, - FIX(1.847759065));
		    tmp3 = z1 + MULTIPLY(d2, FIX(0.765366865));

		    tmp0 = d0 << CONST_BITS;

		    tmp10 = tmp0 + tmp3;
		    tmp13 = tmp0 - tmp3;
		    tmp11 = tmp0 + tmp2;
		    tmp12 = tmp0 - tmp2;
		} else {
		    /* d0 == 0, d2 != 0, d4 == 0, d6 != 0 */
		    z1 = MULTIPLY(d2 + d6, FIX(0.541196100));
		    tmp2 = z1 + MULTIPLY(d6, - FIX(1.847759065));
		    tmp3 = z1 + MULTIPLY(d2, FIX(0.765366865));

		    tmp10 = tmp3;
		    tmp13 = -tmp3;
		    tmp11 = tmp2;
		    tmp12 = -tmp2;
		}
	    } else {
		if (d0) {
		    /* d0 != 0, d2 == 0, d4 == 0, d6 != 0 */
		    tmp2 = MULTIPLY(d6, - FIX(1.306562965));
		    tmp3 = MULTIPLY(d6, FIX(0.541196100));

		    tmp0 = d0 << CONST_BITS;

		    tmp10 = tmp0 + tmp3;
		    tmp13 = tmp0 - tmp3;
		    tmp11 = tmp0 + tmp2;
		    tmp12 = tmp0 - tmp2;
		} else {
		    /* d0 == 0, d2 == 0, d4 == 0, d6 != 0 */
		    tmp2 = MULTIPLY(d6, - FIX(1.306562965));
		    tmp3 = MULTIPLY(d6, FIX(0.541196100));

		    tmp10 = tmp3;
		    tmp13 = -tmp3;
		    tmp11 = tmp2;
		    tmp12 = -tmp2;
		}
	    }
	}
    } else {
	if (d4) {
	    if (d2) {
		if (d0) {
		    /* d0 != 0, d2 != 0, d4 != 0, d6 == 0 */
		    tmp2 = MULTIPLY(d2, FIX(0.541196100));
		    tmp3 = MULTIPLY(d2, FIX(1.306562965));

		    tmp0 = (d0 + d4) << CONST_BITS;
		    tmp1 = (d0 - d4) << CONST_BITS;

		    tmp10 = tmp0 + tmp3;
		    tmp13 = tmp0 - tmp3;
		    tmp11 = tmp1 + tmp2;
		    tmp12 = tmp1 - tmp2;
		} else {
		    /* d0 == 0, d2 != 0, d4 != 0, d6 == 0 */
		    tmp2 = MULTIPLY(d2, FIX(0.541196100));
		    tmp3 = MULTIPLY(d2, FIX(1.306562965));

		    tmp0 = d4 << CONST_BITS;

		    tmp10 = tmp0 + tmp3;
		    tmp13 = tmp0 - tmp3;
		    tmp11 = tmp2 - tmp0;
		    tmp12 = -(tmp0 + tmp2);
		}
	    } else {
		if (d0) {
		    /* d0 != 0, d2 == 0, d4 != 0, d6 == 0 */
		    tmp10 = tmp13 = (d0 + d4) << CONST_BITS;
		    tmp11 = tmp12 = (d0 - d4) << CONST_BITS;
		} else {
		    /* d0 == 0, d2 == 0, d4 != 0, d6 == 0 */
		    tmp10 = tmp13 = d4 << CONST_BITS;
		    tmp11 = tmp12 = -tmp10;
		}
	    }
	} else {
	    if (d2) {
		if (d0) {
		    /* d0 != 0, d2 != 0, d4 == 0, d6 == 0 */
		    tmp2 = MULTIPLY(d2, FIX(0.541196100));
		    tmp3 = MULTIPLY(d2, FIX(1.306562965));

		    tmp0 = d0 << CONST_BITS;

		    tmp10 = tmp0 + tmp3;
		    tmp13 = tmp0 - tmp3;
		    tmp11 = tmp0 + tmp2;
		    tmp12 = tmp0 - tmp2;
		} else {
		    /* d0 == 0, d2 != 0, d4 == 0, d6 == 0 */
		    tmp2 = MULTIPLY(d2, FIX(0.541196100));
		    tmp3 = MULTIPLY(d2, FIX(1.306562965));

		    tmp10 = tmp3;
		    tmp13 = -tmp3;
		    tmp11 = tmp2;
		    tmp12 = -tmp2;
		}
	    } else {
		if (d0) {
		    /* d0 != 0, d2 == 0, d4 == 0, d6 == 0 */
		    tmp10 = tmp13 = tmp11 = tmp12 = d0 << CONST_BITS;
		} else {
		    /* d0 == 0, d2 == 0, d4 == 0, d6 == 0 */
		    tmp10 = tmp13 = tmp11 = tmp12 = 0;
		}
	    }
	}
    }

    /* Odd part per figure 8; the matrix is unitary and hence its
     * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
     */
    if (d7) {
	if (d5) {
	    if (d3) {
		if (d1) {
		    /* d1 != 0, d3 != 0, d5 != 0, d7 != 0 */
		    z1 = d7 + d1;
		    z2 = d5 + d3;
		    z3 = d7 + d3;
		    z4 = d5 + d1;
		    z5 = MULTIPLY(z3 + z4, FIX(1.175875602));

		    tmp0 = MULTIPLY(d7, FIX(0.298631336));
		    tmp1 = MULTIPLY(d5, FIX(2.053119869));
		    tmp2 = MULTIPLY(d3, FIX(3.072711026));
		    tmp3 = MULTIPLY(d1, FIX(1.501321110));
		    z1 = MULTIPLY(z1, - FIX(0.899976223));
		    z2 = MULTIPLY(z2, - FIX(2.562915447));
		    z3 = MULTIPLY(z3, - FIX(1.961570560));
		    z4 = MULTIPLY(z4, - FIX(0.390180644));

		    z3 += z5;
		    z4 += z5;

		    tmp0 += z1 + z3;
		    tmp1 += z2 + z4;
		    tmp2 += z2 + z3;
		    tmp3 += z1 + z4;
		} else {
		    /* d1 == 0, d3 != 0, d5 != 0, d7 != 0 */
		    z1 = d7;
		    z2 = d5 + d3;
		    z3 = d7 + d3;
		    z5 = MULTIPLY(z3 + d5, FIX(1.175875602));

		    tmp0 = MULTIPLY(d7, FIX(0.298631336));
		    tmp1 = MULTIPLY(d5, FIX(2.053119869));
		    tmp2 = MULTIPLY(d3, FIX(3.072711026));
		    z1 = MULTIPLY(d7, - FIX(0.899976223));
		    z2 = MULTIPLY(z2, - FIX(2.562915447));
		    z3 = MULTIPLY(z3, - FIX(1.961570560));
		    z4 = MULTIPLY(d5, - FIX(0.390180644));

		    z3 += z5;
		    z4 += z5;

		    tmp0 += z1 + z3;
		    tmp1 += z2 + z4;
		    tmp2 += z2 + z3;
		    tmp3 = z1 + z4;
		}
	    } else {
		if (d1) {
		    /* d1 != 0, d3 == 0, d5 != 0, d7 != 0 */
		    z1 = d7 + d1;
		    z2 = d5;
		    z3 = d7;
		    z4 = d5 + d1;
		    z5 = MULTIPLY(z3 + z4, FIX(1.175875602));

		    tmp0 = MULTIPLY(d7, FIX(0.298631336));
		    tmp1 = MULTIPLY(d5, FIX(2.053119869));
		    tmp3 = MULTIPLY(d1, FIX(1.501321110));
		    z1 = MULTIPLY(z1, - FIX(0.899976223));
		    z2 = MULTIPLY(d5, - FIX(2.562915447));
		    z3 = MULTIPLY(d7, - FIX(1.961570560));
		    z4 = MULTIPLY(z4, - FIX(0.390180644));

		    z3 += z5;
		    z4 += z5;

		    tmp0 += z1 + z3;
		    tmp1 += z2 + z4;
		    tmp2 = z2 + z3;
		    tmp3 += z1 + z4;
		} else {
		    /* d1 == 0, d3 == 0, d5 != 0, d7 != 0 */
		    tmp0 = MULTIPLY(d7, - FIX(0.601344887));
		    z1 = MULTIPLY(d7, - FIX(0.899976223));
		    z3 = MULTIPLY(d7, - FIX(1.961570560));
		    tmp1 = MULTIPLY(d5, - FIX(0.509795578));
		    z2 = MULTIPLY(d5, - FIX(2.562915447));
		    z4 = MULTIPLY(d5, - FIX(0.390180644));
		    z5 = MULTIPLY(d5 + d7, FIX(1.175875602));

		    z3 += z5;
		    z4 += z5;

		    tmp0 += z3;
		    tmp1 += z4;
		    tmp2 = z2 + z3;
		    tmp3 = z1 + z4;
		}
	    }
	} else {
	    if (d3) {
		if (d1) {
		    /* d1 != 0, d3 != 0, d5 == 0, d7 != 0 */
		    z1 = d7 + d1;
		    z3 = d7 + d3;
		    z5 = MULTIPLY(z3 + d1, FIX(1.175875602));

		    tmp0 = MULTIPLY(d7, FIX(0.298631336));
		    tmp2 = MULTIPLY(d3, FIX(3.072711026));
		    tmp3 = MULTIPLY(d1, FIX(1.501321110));
		    z1 = MULTIPLY(z1, - FIX(0.899976223));
		    z2 = MULTIPLY(d3, - FIX(2.562915447));
		    z3 = MULTIPLY(z3, - FIX(1.961570560));
		    z4 = MULTIPLY(d1, - FIX(0.390180644));

		    z3 += z5;
		    z4 += z5;

		    tmp0 += z1 + z3;
		    tmp1 = z2 + z4;
		    tmp2 += z2 + z3;
		    tmp3 += z1 + z4;
		} else {
		    /* d1 == 0, d3 != 0, d5 == 0, d7 != 0 */
		    z3 = d7 + d3;

		    tmp0 = MULTIPLY(d7, - FIX(0.601344887));
		    z1 = MULTIPLY(d7, - FIX(0.899976223));
		    tmp2 = MULTIPLY(d3, FIX(0.509795579));
		    z2 = MULTIPLY(d3, - FIX(2.562915447));
		    z5 = MULTIPLY(z3, FIX(1.175875602));
		    z3 = MULTIPLY(z3, - FIX(0.785694958));

		    tmp0 += z3;
		    tmp1 = z2 + z5;
		    tmp2 += z3;
		    tmp3 = z1 + z5;
		}
	    } else {
		if (d1) {
		    /* d1 != 0, d3 == 0, d5 == 0, d7 != 0 */
		    z1 = d7 + d1;
		    z5 = MULTIPLY(z1, FIX(1.175875602));

		    z1 = MULTIPLY(z1, FIX(0.275899379));
		    z3 = MULTIPLY(d7, - FIX(1.961570560));
		    tmp0 = MULTIPLY(d7, - FIX(1.662939224));
		    z4 = MULTIPLY(d1, - FIX(0.390180644));
		    tmp3 = MULTIPLY(d1, FIX(1.111140466));

		    tmp0 += z1;
		    tmp1 = z4 + z5;
		    tmp2 = z3 + z5;
		    tmp3 += z1;
		} else {
		    /* d1 == 0, d3 == 0, d5 == 0, d7 != 0 */
		    tmp0 = MULTIPLY(d7, - FIX(1.387039845));
		    tmp1 = MULTIPLY(d7, FIX(1.175875602));
		    tmp2 = MULTIPLY(d7, - FIX(0.785694958));
		    tmp3 = MULTIPLY(d7, FIX(0.275899379));
		}
	    }
	}
    } else {
	if (d5) {
	    if (d3) {
		if (d1) {
		    /* d1 != 0, d3 != 0, d5 != 0, d7 == 0 */
		    z2 = d5 + d3;
		    z4 = d5 + d1;
		    z5 = MULTIPLY(d3 + z4, FIX(1.175875602));

		    tmp1 = MULTIPLY(d5, FIX(2.053119869));
		    tmp2 = MULTIPLY(d3, FIX(3.072711026));
		    tmp3 = MULTIPLY(d1, FIX(1.501321110));
		    z1 = MULTIPLY(d1, - FIX(0.899976223));
		    z2 = MULTIPLY(z2, - FIX(2.562915447));
		    z3 = MULTIPLY(d3, - FIX(1.961570560));
		    z4 = MULTIPLY(z4, - FIX(0.390180644));

		    z3 += z5;
		    z4 += z5;

		    tmp0 = z1 + z3;
		    tmp1 += z2 + z4;
		    tmp2 += z2 + z3;
		    tmp3 += z1 + z4;
		} else {
		    /* d1 == 0, d3 != 0, d5 != 0, d7 == 0 */
		    z2 = d5 + d3;

		    z5 = MULTIPLY(z2, FIX(1.175875602));
		    tmp1 = MULTIPLY(d5, FIX(1.662939225));
		    z4 = MULTIPLY(d5, - FIX(0.390180644));
		    z2 = MULTIPLY(z2, - FIX(1.387039845));
		    tmp2 = MULTIPLY(d3, FIX(1.111140466));
		    z3 = MULTIPLY(d3, - FIX(1.961570560));

		    tmp0 = z3 + z5;
		    tmp1 += z2;
		    tmp2 += z2;
		    tmp3 = z4 + z5;
		}
	    } else {
		if (d1) {
		    /* d1 != 0, d3 == 0, d5 != 0, d7 == 0 */
		    z4 = d5 + d1;

		    z5 = MULTIPLY(z4, FIX(1.175875602));
		    z1 = MULTIPLY(d1, - FIX(0.899976223));
		    tmp3 = MULTIPLY(d1, FIX(0.601344887));
		    tmp1 = MULTIPLY(d5, - FIX(0.509795578));
		    z2 = MULTIPLY(d5, - FIX(2.562915447));
		    z4 = MULTIPLY(z4, FIX(0.785694958));

		    tmp0 = z1 + z5;
		    tmp1 += z4;
		    tmp2 = z2 + z5;
		    tmp3 += z4;
		} else {
		    /* d1 == 0, d3 == 0, d5 != 0, d7 == 0 */
		    tmp0 = MULTIPLY(d5, FIX(1.175875602));
		    tmp1 = MULTIPLY(d5, FIX(0.275899380));
		    tmp2 = MULTIPLY(d5, - FIX(1.387039845));
		    tmp3 = MULTIPLY(d5, FIX(0.785694958));
		}
	    }
	} else {
	    if (d3) {
		if (d1) {
		    /* d1 != 0, d3 != 0, d5 == 0, d7 == 0 */
		    z5 = d1 + d3;
		    tmp3 = MULTIPLY(d1, FIX(0.211164243));
		    tmp2 = MULTIPLY(d3, - FIX(1.451774981));
		    z1 = MULTIPLY(d1, FIX(1.061594337));
		    z2 = MULTIPLY(d3, - FIX(2.172734803));
		    z4 = MULTIPLY(z5, FIX(0.785694958));
		    z5 = MULTIPLY(z5, FIX(1.175875602));

		    tmp0 = z1 - z4;
		    tmp1 = z2 + z4;
		    tmp2 += z5;
		    tmp3 += z5;
		} else {
		    /* d1 == 0, d3 != 0, d5 == 0, d7 == 0 */
		    tmp0 = MULTIPLY(d3, - FIX(0.785694958));
		    tmp1 = MULTIPLY(d3, - FIX(1.387039845));
		    tmp2 = MULTIPLY(d3, - FIX(0.275899379));
		    tmp3 = MULTIPLY(d3, FIX(1.175875602));
		}
	    } else {
		if (d1) {
		    /* d1 != 0, d3 == 0, d5 == 0, d7 == 0 */
		    tmp0 = MULTIPLY(d1, FIX(0.275899379));
		    tmp1 = MULTIPLY(d1, FIX(0.785694958));
		    tmp2 = MULTIPLY(d1, FIX(1.175875602));
		    tmp3 = MULTIPLY(d1, FIX(1.387039845));
		} else {
		    /* d1 == 0, d3 == 0, d5 == 0, d7 == 0 */
		    tmp0 = tmp1 = tmp2 = tmp3 = 0;
		}
	    }
	}
    }

    /* Final output stage: inputs are tmp10..tmp13, tmp0..tmp3 */
    /*
     * d0 and d1 are no longer needed as inputs from here on: d0 carries
     * the sample being produced and d1 is the scratch variable UCLIMIT
     * requires.  The eight samples go down one column of the output.
     */

    d0 = DESCALE(tmp10 + tmp3, CONST_BITS+PASS1_BITS+3);
    d0 = UCLIMIT(d0 + 128, d1);
    *p = d0;
    p += stride;

    d0 = DESCALE(tmp11 + tmp2, CONST_BITS+PASS1_BITS+3);
    d0 = UCLIMIT(d0 + 128, d1);
    *p = d0;
    p += stride;

    d0 = DESCALE(tmp12 + tmp1, CONST_BITS+PASS1_BITS+3);
    d0 = UCLIMIT(d0 + 128, d1);
    *p = d0;
    p += stride;

    d0 = DESCALE(tmp13 + tmp0, CONST_BITS+PASS1_BITS+3);
    d0 = UCLIMIT(d0 + 128, d1);
    *p = d0;
    p += stride;

    d0 = DESCALE(tmp13 - tmp0, CONST_BITS+PASS1_BITS+3);
    d0 = UCLIMIT(d0 + 128, d1);
    *p = d0;
    p += stride;

    d0 = DESCALE(tmp12 - tmp1, CONST_BITS+PASS1_BITS+3);
    d0 = UCLIMIT(d0 + 128, d1);
    *p = d0;
    p += stride;

    d0 = DESCALE(tmp11 - tmp2, CONST_BITS+PASS1_BITS+3);
    d0 = UCLIMIT(d0 + 128, d1);
    *p = d0;
    p += stride;

    d0 = DESCALE(tmp10 - tmp3, CONST_BITS+PASS1_BITS+3);
    d0 = UCLIMIT(d0 + 128, d1);
    *p = d0;
    p += stride;

    /* back up to the top row, then step to the next output column */
    p -= stride << 3;
    ++p;
    ++bp;			/* advance pointer to next column */
  }
}

/*
 * Named fixed-point constants: each FIX_a_bbbbbbbbb is FIX(a.bbbbbbbbb),
 * i.e. the value multiplied by 2**CONST_BITS, with 0.5 added before
 * conversion to int.
 * NOTE(review): nothing in the visible part of this file refers to
 * these names (the DCT_EVEN/DCT_ODD macros spell out FIX(...) directly)
 * -- confirm whether they are still needed.
 */
#define FIX_0_298631336  FIX(0.298631336)
#define FIX_0_390180644  FIX(0.390180644)
#define FIX_0_541196100  FIX(0.541196100)
#define FIX_0_765366865  FIX(0.765366865)
#define FIX_0_899976223  FIX(0.899976223)
#define FIX_1_175875602  FIX(1.175875602)
#define FIX_1_501321110  FIX(1.501321110)
#define FIX_1_847759065  FIX(1.847759065)
#define FIX_1_961570560  FIX(1.961570560)
#define FIX_2_053119869  FIX(2.053119869)
#define FIX_2_562915447  FIX(2.562915447)
#define FIX_3_072711026  FIX(3.072711026)

/*
 * DCT_EVEN -- even half of a 1-D 8-point inverse DCT.
 *
 * Even part: reverse the even part of the forward DCT.
 * The rotator is sqrt(2)*c(-6).
 *
 * Reads bp[0], bp[2*stride], bp[4*stride] and bp[6*stride] and stores
 * the four even-part terms, scaled up by 2**CONST_BITS, in the lvalues
 * t10, t11, t12 and t13.
 *
 * Every parameter is parenthesised, so an expression (e.g. `base + 8`
 * or `n * 2`) may be passed for bp or stride.  Caveats that remain:
 *  - bp and stride are evaluated several times; pass nothing with side
 *    effects.
 *  - the expansion declares z1..z3 and tmp0..tmp3, so the arguments must
 *    not use those names.
 *  - the expansion is a plain { } block, not do { } while (0), so that
 *    existing call sites expand as they did before; do not use it as the
 *    unbraced body of an if that has an else.
 */
#define DCT_EVEN(bp, t10, t11, t12, t13, stride) \
{ \
	int z2 = (bp)[2 * (stride)]; \
	int z3 = (bp)[6 * (stride)]; \
	int z1 = (z2 + z3) * FIX(0.541196100); \
	int tmp2 = z1 + z3 * -FIX(1.847759065); \
	int tmp3 = z1 + z2 * FIX(0.765366865); \
 \
	int tmp0 = ((bp)[0] + (bp)[4 * (stride)]) << CONST_BITS; \
	int tmp1 = ((bp)[0] - (bp)[4 * (stride)]) << CONST_BITS; \
 \
	(t10) = tmp0 + tmp3; \
	(t13) = tmp0 - tmp3; \
	(t11) = tmp1 + tmp2; \
	(t12) = tmp1 - tmp2; \
}

/*
 * DCT_ODD -- odd half of a 1-D 8-point inverse DCT.
 *
 * Odd part per figure 8; the matrix is unitary and hence its
 * transpose is its inverse.  i0..i3 are y7,y5,y3,y1 respectively.
 *
 * Reads bp[7*stride], bp[5*stride], bp[3*stride] and bp[1*stride] into
 * the int lvalues t0, t1, t2 and t3 (in that order) and transforms them
 * in place; the results carry the same 2**CONST_BITS scaling as the FIX
 * constants.
 *
 * Every parameter is parenthesised, so an expression may be passed for
 * bp or stride.  Caveats that remain:
 *  - bp, stride and the t arguments are evaluated several times; pass
 *    nothing with side effects.
 *  - the expansion declares z1..z5, so the arguments must not use those
 *    names.
 *  - the expansion is a plain { } block, not do { } while (0), so that
 *    existing call sites expand as they did before; do not use it as the
 *    unbraced body of an if that has an else.
 */
#define DCT_ODD(bp, t0, t1, t2, t3, stride) \
{ \
	(t0) = (bp)[(stride)*7]; \
	(t1) = (bp)[(stride)*5]; \
	(t2) = (bp)[(stride)*3]; \
	(t3) = (bp)[(stride)*1]; \
 \
	int z1 = (t0) + (t3); \
	int z2 = (t1) + (t2); \
	int z3 = (t0) + (t2); \
	int z4 = (t1) + (t3); \
	int z5 = (z3 + z4) * FIX(1.175875602); /* sqrt(2) * c3 */ \
     \
	(t0) *= FIX(0.298631336);	/* sqrt(2) * (-c1+c3+c5-c7) */ \
	(t1) *= FIX(2.053119869);	/* sqrt(2) * ( c1+c3-c5+c7) */ \
	(t2) *= FIX(3.072711026);	/* sqrt(2) * ( c1+c3+c5-c7) */ \
	(t3) *= FIX(1.501321110);	/* sqrt(2) * ( c1+c3-c5-c7) */ \
	z1 *= -FIX(0.899976223);	/* sqrt(2) * (c7-c3) */ \
	z2 *= -FIX(2.562915447);	/* sqrt(2) * (-c1-c3) */ \
	z3 *= -FIX(1.961570560);	/* sqrt(2) * (-c3-c5) */ \
	z4 *= -FIX(0.390180644);	/* sqrt(2) * (c5-c3) */ \
     \
	z3 += z5; \
	z4 += z5; \
     \
	(t0) += z1 + z3; \
	(t1) += z2 + z4; \
	(t2) += z2 + z3; \
	(t3) += z1 + z4; \
}

/*
 * idct --
 *
 *	8x8 inverse DCT built from the DCT_EVEN/DCT_ODD macros.
 *
 *	bp	64 coefficients; overwritten in place with intermediate
 *		and final results.
 *	mask	two words forming a 64-bit mask (mask[0] holds the low
 *		bits; the shifts below assume u_int is 32 bits wide).
 *		For i in 1..63, a clear bit i means coefficient i is
 *		treated as zero, and bp[i] is zeroed before the transform.
 *		bp[0] is always used as-is.
 *	p	output pixels: eight lines of eight bytes, each sample
 *		offset by +128 and clamped to [0..255].
 *	stride	distance in bytes between successive output lines.
 */
void
idct(register short* bp, u_int* mask, u_char* p, int stride)
{
	u_int m0 = mask[0];
	u_int m1 = mask[1];
	int k;		/* shared by both passes; declared at function scope */

	/* Clear every coefficient whose mask bit is not set. */
	for (int i = 1; i < 64; ++i) {
		m0 >>= 1;
		m0 |= m1 << 31;
		m1 >>= 1;
		if ((m0 & 1) == 0)
			bp[i] = 0;
	}

	/* Pass 1: process columns. */
	/* Note results are scaled up by sqrt(8) compared to a true IDCT; */
	/* furthermore, we scale the results by 2**PASS1_BITS. */

	for (k = 8; --k >= 0; ) {
		/*
		 * Due to quantization, we will usually find that many of
		 * the input coefficients are zero, especially the AC terms.
		 * We could exploit this by short-circuiting the IDCT
		 * calculation for any column in which all the AC terms are
		 * zero.  In that case each output is equal to the DC
		 * coefficient (with scale factor as needed).
		 */
		/*FIXME the short-circuit described above is not implemented */

		int e10, e11, e12, e13;
		DCT_EVEN(bp, e10, e11, e12, e13, 8);
		int o0, o1, o2, o3;
		DCT_ODD(bp, o0, o1, o2, o3, 8);

		/*
		 * Final output stage.  Keep PASS1_BITS extra fractional
		 * bits in the intermediate values so that pass 2 does not
		 * start from coarsely rounded data.
		 */

		bp[8*0] = DESCALE(e10 + o3, CONST_BITS-PASS1_BITS);
		bp[8*7] = DESCALE(e10 - o3, CONST_BITS-PASS1_BITS);
		bp[8*1] = DESCALE(e11 + o2, CONST_BITS-PASS1_BITS);
		bp[8*6] = DESCALE(e11 - o2, CONST_BITS-PASS1_BITS);
		bp[8*2] = DESCALE(e12 + o1, CONST_BITS-PASS1_BITS);
		bp[8*5] = DESCALE(e12 - o1, CONST_BITS-PASS1_BITS);
		bp[8*3] = DESCALE(e13 + o0, CONST_BITS-PASS1_BITS);
		bp[8*4] = DESCALE(e13 - o0, CONST_BITS-PASS1_BITS);

		++bp;
	}

	/* Pass 2: process rows. */
	/* Note that we must descale the results by a factor of 8 == 2**3, */
	/* and also undo the PASS1_BITS scaling. */

	bp -= 8;
	for (k = 8; --k >= 0; ) {

		int e10, e11, e12, e13;
		DCT_EVEN(bp, e10, e11, e12, e13, 1);
		int o0, o1, o2, o3;
		DCT_ODD(bp, o0, o1, o2, o3, 1);

		/* Final output stage */

		bp[0] = DESCALE(e10 + o3, CONST_BITS+PASS1_BITS+3);
		bp[7] = DESCALE(e10 - o3, CONST_BITS+PASS1_BITS+3);
		bp[1] = DESCALE(e11 + o2, CONST_BITS+PASS1_BITS+3);
		bp[6] = DESCALE(e11 - o2, CONST_BITS+PASS1_BITS+3);
		bp[2] = DESCALE(e12 + o1, CONST_BITS+PASS1_BITS+3);
		bp[5] = DESCALE(e12 - o1, CONST_BITS+PASS1_BITS+3);
		bp[3] = DESCALE(e13 + o0, CONST_BITS+PASS1_BITS+3);
		bp[4] = DESCALE(e13 - o0, CONST_BITS+PASS1_BITS+3);

		bp += 8;
	}

	/* Level-shift by +128, clamp to a byte and store line by line. */
	bp -= 64;
	for (int y = 0; y < 8; ++y) {
		for (int x = 0; x < 8; ++x) {
			int t;
			int v = *bp++;
			p[x] = UCLIMIT(v + 128, t);
		}
		p += stride;
	}
}

/*
 * A 2D Inverse DCT based on a row-column decomposition using
 * Vetterli's 8pt 1D Inverse DCT, from Fig. 4-7 Pennebaker & Mitchell
 * (i.e., the pink JPEG book)  This figure is the forward transform.
 * Reverse the flowgraph for the inverse (you need to draw a picture).
 * To reverse the rotations, you must swap the pair of inputs and swap
 * the pair of outputs, in addition to reversing the flowgraph edges.
 * This implementation differs from the flow graph by a factor of
 * 1/cos(4pi / 16) = sqrt(2).  After both the row and column passes,
 * this is a factor of two, which we just absorb into the final
 * descaling computation.
 *
 * The input coefficients are, counter to tradition, in column-order.
 * The bit mask indicates which coefficients are non-zero.  If the
 * corresponding bit is zero, then the coefficient is assumed zero
 * and the input coefficient is not referenced and need not be defined.
 * The 8-bit outputs are computed in row order and placed in the
 * output array pointed to by p, with each of the eight 8-byte lines
 * offset by "stride" bytes.
 *
 * qt is the inverse quantization table in column order.  These
 * coefficients are the product of the inverse quantization factor,
 * specified by the jpeg quantization table, and the first multiplier
 * in the inverse DCT flow graph.
 */
void
v_rdct(register short *bp, u_int* mask, u_char* p, int stride, const int* qt)
{
	/*FIXME*/
	u_int m0 = mask[0];
	u_int m1 = mask[1];

	for (int i = 8; --i >= 0; ) {
#define M(n) (m0 & 1 << (n))
#define S(qt, n) qt[(n) << 1 | 1]
#define C(qt, n) qt[(n) << 1]
		int x0 = M(0) ? qt[0] * bp[0] : 0;
		int x1 = M(4) ? qt[4 << 1] * bp[4] : 0;
		int t2, t3;
		if (M(6)) {
			int t = bp[6];
			t3 = C(qt, 6) * t;
			t2 = S(qt, 6) * t;
		} else
			t2 = t3 = 0;
		if (M(2)) {
			int t = bp[2];
			t3 += S(qt, 2) * t;
			t2 += C(qt, 2) * t;
		}
		int x4, x5;
		if (M(7)) {
			int t = bp[7];
			x4 = -C(qt, 7) * t;
			x5 = S(qt, 7) * t;
		} else
			x4 = x5 = 0;
		if (M(1)) {
			int t = bp[1];
			x4 -= S(qt, 1) * t;
			x5 += C(qt, 1) * t;
		}
		int x6, t7;
		if (M(5)) {
			int t = bp[5];
			x6 = C(qt, 5) * t;
			t7 = S(qt, 5) * t;
		} else
			x6 = t7 = 0;
		if (M(3)) {
			int t = bp[3];
			x6 += S(qt, 3) * t;
			t7 += C(qt, 3) * t;
		}
#undef S
#undef C
#undef M
		int t0 = x0 + x1;
		int t1 = x0 - x1;
		int t4 = x4 + x6;
		int t5 = FIX(C4) * DESCALE(x5 - t7, CONST_BITS);
		int t6 = FIX(C4) * DESCALE(x4 - x6, CONST_BITS);
		t7 += x5;

		x0 = t0 + t3;
		int x3 = t0 - t3;
		x1 = t1 + t5;
		x5 = t1 - t5;
		int x2 = t6 + t2;
		x6 = t6 - t2;

		bp[0] = DESCALE(x0 + t7, CONST_BITS-PASS1_BITS);
		bp[7] = DESCALE(x0 - t7, CONST_BITS-PASS1_BITS);
		bp[1] = DESCALE(x1 + x2, CONST_BITS-PASS1_BITS);
		bp[2] = DESCALE(x1 - x2, CONST_BITS-PASS1_BITS);
		bp[3] = DESCALE(x3 + t4, CONST_BITS-PASS1_BITS);
		bp[4] = DESCALE(x3 - t4, CONST_BITS-PASS1_BITS);
		bp[5] = DESCALE(x5 + x6, CONST_BITS-PASS1_BITS);
		bp[6] = DESCALE(x5 - x6, CONST_BITS-PASS1_BITS);

		bp += 8;
		qt += 2*8;

		m0 >>= 8;
		m0 |= m1 << 24;
		m1 >>= 8;
	}
	bp -= 64;
	for (i = 8; --i >= 0; ) {
		int x2, x3;
		int x0 = bp[8*0] << CONST_BITS;
		int x1 = bp[8*4] << CONST_BITS;
		int t3 = FIX(C6 / C4) * bp[8*6] + FIX(S6 / C4) * bp[8*2];
		int t2 = -FIX(S6 / C4) * bp[8*6] + FIX(C6 / C4) * bp[8*2];
		int x4 = -(FIX(C_1 / C4) * bp[8*7] + FIX(S_1 / C4) * bp[8*1]);
		int x5 = -FIX(S_1 / C4) * bp[8*7] + FIX(C_1 / C4) * bp[8*1];
		int x6 = FIX(C_3 / C4) * bp[8*5] + FIX(S_3 / C4) * bp[8*3];
		int t7 = -FIX(S_3 / C4) * bp[8*5] + FIX(C_3 / C4) * bp[8*3];

		int t0 = x0 + x1;
		int t1 = x0 - x1;
		int t4 = x4 + x6;
		int t5 = FIX(C4) * DESCALE(x5 - t7, CONST_BITS);
		int t6 = FIX(C4) * DESCALE(x4 - x6, CONST_BITS);
		t7 += x5;

		x0 = t0 + t3;
		x3 = t0 - t3;
		x1 = t1 + t5;
		x5 = t1 - t5;
		x2 = t6 + t2;
		x6 = t6 - t2;
#ifdef INT_64
		/*FIXME assume little-endian */
		u_int v = DESCALE(x0 + t7, CONST_BITS+PASS1_BITS+3);
		v = UCLIMIT(v + 128, t3);
		INT_64 pix = v;
		v = DESCALE(x1 + x2, CONST_BITS+PASS1_BITS+3);
		pix |= (INT_64)UCLIMIT(v + 128, t3) << 8;
		v = DESCALE(x1 - x2, CONST_BITS+PASS1_BITS+3);
		pix |= (INT_64)UCLIMIT(v + 128, t3) << 16;
		v = DESCALE(x3 + t4, CONST_BITS+PASS1_BITS+3);
		pix |= (INT_64)UCLIMIT(v + 128, t3) << 24;
		v = DESCALE(x3 - t4, CONST_BITS+PASS1_BITS+3);
		pix |= (INT_64)UCLIMIT(v + 128, t3) << 32;
		v = DESCALE(x5 + x6, CONST_BITS+PASS1_BITS+3);
		pix |= (INT_64)UCLIMIT(v + 128, t3) << 40;
		v = DESCALE(x5 - x6, CONST_BITS+PASS1_BITS+3);
		pix |= (INT_64)UCLIMIT(v + 128, t3) << 48;
		v = DESCALE(x0 - t7, CONST_BITS+PASS1_BITS+3);
		pix |= (INT_64)UCLIMIT(v + 128, t3) << 56;
		*(INT_64*)p = pix;
#else
#if BYTE_ORDER == LITTLE_ENDIAN
#define SPLICE_PIXEL(v, p, n) (v) |= (p) << (24 - (n))
#else
#define SPLICE_PIXEL(v, p, n) (v) |= (p) << (n)
#endif

		u_int v = DESCALE(x0 + t7, CONST_BITS+PASS1_BITS+3);
		v = UCLIMIT(v + 128, t3);
		u_int pix = 0;
		SPLICE_PIXEL(pix, v, 24);
		v = DESCALE(x1 + x2, CONST_BITS+PASS1_BITS+3);
		v = UCLIMIT(v + 128, t3);
		SPLICE_PIXEL(pix, v, 16);
		v = DESCALE(x1 - x2, CONST_BITS+PASS1_BITS+3);
		v = UCLIMIT(v + 128, t3);
		SPLICE_PIXEL(pix, v, 8);
		v = DESCALE(x3 + t4, CONST_BITS+PASS1_BITS+3);
		v = UCLIMIT(v + 128, t3);
		SPLICE_PIXEL(pix, v, 0);
		*(u_int*)p = pix;

		v = DESCALE(x3 - t4, CONST_BITS+PASS1_BITS+3);
		v = UCLIMIT(v + 128, t3);
		pix = 0;
		SPLICE_PIXEL(pix, v, 24);
		v = DESCALE(x5 + x6, CONST_BITS+PASS1_BITS+3);
		v = UCLIMIT(v + 128, t3);
		SPLICE_PIXEL(pix, v, 16);
		v = DESCALE(x5 - x6, CONST_BITS+PASS1_BITS+3);
		v = UCLIMIT(v + 128, t3);
		SPLICE_PIXEL(pix, v, 8);
		v = DESCALE(x0 - t7, CONST_BITS+PASS1_BITS+3);
		v = UCLIMIT(v + 128, t3);
		SPLICE_PIXEL(pix, v, 0);
		*(u_int*)(p + 4) = pix;
#endif
		bp += 8;
		p += stride;
	}
}
