/*
 * color-true.cc --
 *
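 *      True-color color model (a YUV-chroma-to-RGB lookup table built from
 *      the X visual's channel masks) and the 24- and 32-bit-per-pixel
 *      window renderers that map 4:2:2, 4:1:1 and grayscale frames into
 *      packed RGB pixels.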
 *
 * Copyright (c) 1993-2002 The Regents of the University of California.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions are met:
 *
 * A. Redistributions of source code must retain the above copyright notice,
 *    this list of conditions and the following disclaimer.
 * B. Redistributions in binary form must reproduce the above copyright notice,
 *    this list of conditions and the following disclaimer in the documentation
 *    and/or other materials provided with the distribution.
 * C. Neither the names of the copyright holders nor the names of its
 *    contributors may be used to endorse or promote products derived from this
 *    software without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS
 * IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
 * THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE
 * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#ifndef lint
/* RCS/SCCS version-identification string for this file; compiled out when `lint' is defined. */
static const char rcsid[] =
    "@(#) $Header: /usr/mash/src/repository/mash/mash-1/render/color-true.cc,v 1.17 2002/02/03 04:15:14 lim Exp $";
#endif

#include <stdio.h>
#include <stdlib.h>
#ifndef WIN32
#   include <X11/Xlib.h>
#   include <X11/Xutil.h>
#endif
#include "bsd-endian.h"
#include "color.h"
#include "renderer-window.h"
#include "inet.h"
#include "tclcl.h"
#include "vw.h"

//#ifdef WIN32
//typedef RGBTRIPLE* RGBPointer;
//#else
// Pointer to the output pixels written by the 32-bit renderers below;
// every pixel is stored as one u_int.
typedef u_int* RGBPointer;
//#endif

/*
 * Constants and globals that exist only when MMX_CSCONV_ENABLED is set.
 * The __asm__("name") suffix pins the assembler-level symbol name of each
 * int so that assembly code can refer to it by that exact name.  The
 * map_422()/map_411() routines below fill these globals in before calling
 * the *_asm helpers.
 * NOTE(review): the *_asm bodies are not in this part of the file, so the
 * exact use of each 64-bit constant cannot be confirmed from here.
 */
#if MMX_CSCONV_ENABLED
u_int64_t const128 = 0x0080008000800080LL;
u_int64_t empty = 0x0000000000000000LL;
u_int64_t davemask = 0x0000FFFFFFFF0000LL;

u_int64_t const1 = 0x59BA0000D24B59BALL; // Cr_r Cr_b Cr_g Cr_r
u_int64_t const2 = 0x00007168E9FA0000LL; // Cb_r Cb_b Cb_g Cb_r
u_int64_t const5 = 0x0000D24B59BA0000LL; // Cr_b Cr_g Cr_r Cr_b
u_int64_t const6 = 0x7168E9FA00007168LL; // Cb_b Cb_g Cb_r Cb_b

// constants for factors (One_Half/fix(x)) << 2
u_int64_t const05 = 0x0001000000000001LL; // Cr_r Cr_b Cr_g Cr_r
u_int64_t const15 = 0x00000001FFFA0000LL; // Cb_r Cb_b Cb_g Cb_r
u_int64_t const45 = 0x0000000000010000LL; // Cr_b Cr_g Cr_r Cr_b
u_int64_t const55 = 0x0001FFFA00000001LL; // Cb_b Cb_g Cb_r Cb_b
int picsize __asm__("picsize");
int segwidth __asm__("segwidth");
int cstride __asm__("cstride");
int pstride __asm__("pstride");
int pstride2 __asm__("pstride2");
int segwidthconst __asm__("segwidthconst");
//u_int64_t outcheck __asm__("outcheck"); // used to debug output
#endif

/*
 * Color model for true-color visuals.
 *
 * alloc_colors() derives two masks and a 64K-entry lookup table from the
 * visual's red/green/blue masks:
 *   omask_  - the top bit of each color channel (used to detect
 *             overflow/underflow when luma and chroma are summed),
 *   pmask_  - all bits of all three channels,
 *   uvtab_  - chroma contribution to each channel, indexed by (u << 8) | v.
 */
class TrueColorModel : public ColorModel {
public:
	~TrueColorModel();

	virtual int alloc_colors();
	virtual int alloc_grays();

	/* Read-only accessors used by the renderers. */
	u_int omask() const { return omask_; }
	u_int pmask() const { return pmask_; }
	const u_int* uvtab() const { return uvtab_; }

protected:
	u_int omask_;
	u_int pmask_;
	u_int uvtab_[65536];
};

/*
 * Tcl class binding: registers the name "Colormodel/TrueColor/24" and
 * creates a TrueColorModel for it.  The creation arguments are ignored.
 */
class TrueColorClass : public TclClass {
public:
	TrueColorClass() : TclClass("Colormodel/TrueColor/24") {}

	TclObject* create(int /* argc */, const char*const* /* argv */) {
		TrueColorModel* model = new TrueColorModel();
		return model;
	}
} truecolor_class;

/*
 * Destructor.  The members declared by TrueColorModel itself (two masks
 * and the in-object uvtab_ array) need no explicit release; the original
 * author left a FIXME here without saying what else should be cleaned up.
 */
TrueColorModel::~TrueColorModel()
{
	/*FIXME*/
}

/*
 * mtos -- "mask to shift".
 *
 * Returns the bit index of the least-significant set bit of `mask', or 0
 * when `mask' is zero.
 *
 * Callers pass 32-bit channel masks (and their complements) that may have
 * the top bit set, which makes the int argument negative.  Right-shifting
 * a negative signed value is implementation-defined, so the scan is done
 * on an unsigned copy of the bits instead.
 */
static int
mtos(int mask)
{
	unsigned int bits = (unsigned int)mask;
	int shift = 0;

	if (bits == 0)
		return (0);
	for (; (bits & 1) == 0; bits >>= 1)
		++shift;
	return (shift);
}

/*
 * Gray allocation hook: does nothing for this model and always returns 0.
 */
int TrueColorModel::alloc_grays()
{
	return 0;
}

int TrueColorModel::alloc_colors()
{
	u_int rmask = visual_->red_mask;
	u_int gmask = visual_->green_mask;
	u_int bmask = visual_->blue_mask;

	/*
	 * Handling byte ordering here is a pain -- Xlib
	 * will always send an image in the byte order the
	 * server has asked for.  We can override the
	 * byte_order field in the XImage struct so it
	 * works if we render in the native byte order but
	 * if there's a mismatch, Xlib will swap bytes
	 * before sending the image to the server.
	 *
	 * Since we're just doing table lookups here, we
	 * may as well get the byte ordering correct and
	 * render directly in to the format the server
	 * wants.  This code is used only for 8 bit per
	 * pixel images so we can just swap the masks
	 * before computing the table entries.  Note that
	 * we can't use htonl() since we may have to do
	 * byte swapping on a big-endian machine connected
	 * to an X server that wants little-endian images.
	 */

#if BYTE_ORDER == LITTLE_ENDIAN
	if (ImageByteOrder(dpy_) == MSBFirst) {
		rmask = htonl(rmask);
		gmask = htonl(gmask);
		bmask = htonl(bmask);
	}
#else
	if (ImageByteOrder(dpy_) == LSBFirst) {
#define SWAP32(x) \
	(x) = (((x)&0x000000ff)<<24) | (((x)&0x0000ff00)<<8) \
	| (((x)&0x00ff0000)>>8) | (((x)&0xff000000)>>24)
		SWAP32(rmask);
		SWAP32(gmask);
		SWAP32(bmask);
	}
#endif
	u_int rshft = mtos(rmask);
	u_int rlose = 8 - mtos(~(rmask >> rshft));
	u_int gshft = mtos(gmask);
	u_int glose = 8 - mtos(~(gmask >> gshft));
	u_int bshft = mtos(bmask);
	u_int blose = 8 - mtos(~(bmask >> bshft));

	omask_ = 0x80 >> rlose << rshft;
	omask_ |= 0x80 >> glose << gshft;
	omask_ |= 0x80 >> blose << bshft;

	pmask_ = 0xff >> rlose << rshft;
	pmask_ |= 0xff >> glose << gshft;
	pmask_ |= 0xff >> blose << bshft;

	for (int u = 0; u < 256; ++u) {
		register double uf = double(u - 128);
		for (int v = 0; v < 256; ++v) {
			register double vf = double(v - 128);
			int r = int(vf * 1.402);
			r = (r < -128)? -128 : (r > 127)? 127 : r;
			int b = int(uf * 1.772);
			b = (b < -128)? -128 : (b > 127)? 127 : b;
			int g = int(uf * -0.34414 - vf * 0.71414);
			g = (g < -128)? -128 : (g > 127)? 127 : g;
			uvtab_[(u << 8)|v] =
				(r & 0xff) >> rlose << rshft |
				(g & 0xff) >> glose << gshft |
				(b & 0xff) >> blose << bshft;
		}
	}
	/* used to test speed of procs */

	return (0);
}

/*
 * Abstract base of the true-color window renderers.  It only adds a
 * reference to the TrueColorModel whose masks and lookup table the
 * concrete renderers read; render(), update() and disable() stay pure
 * virtual.
 */
class TrueWindowRenderer : public WindowDitherer {
public:
	TrueWindowRenderer(VideoWindow* vw, int decimation, int heuristics,
			   TrueColorModel& cm)
		: WindowDitherer(vw, decimation, heuristics),
		  cm_(cm)
	{
	}

	virtual void render(const u_char* frm, int off, int x, int w, int h) = 0;

protected:
	TrueColorModel& cm_;

	virtual void update() = 0;
	virtual void disable() = 0;
};

class TrueWindowRenderer24;
typedef void (TrueWindowRenderer24::*True24Method)(const u_char*, u_int,
					   u_int, u_int, u_int) const;

class TrueWindowRenderer24 : public TrueWindowRenderer {
public:
    TrueWindowRenderer24(VideoWindow* vw, int decimation, int heuristics, TrueColorModel& cm)
      : TrueWindowRenderer(vw, decimation, heuristics, cm) {}
    virtual void render(const u_char* frm, int off, int x, int w, int h) {
	(this->*method_)(frm, off, x, w, h);
    }
protected:
	virtual void update();
	virtual void disable() { method_ = True24Method(&WindowRenderer::dither_null); }
	True24Method method_;

	void map_422(const u_char* frm, u_int off, u_int x,
			     u_int width, u_int height) const;
	void map_down2_422(const u_char* frm, u_int off, u_int x,
				   u_int width, u_int height) const;
	void map_down4_422(const u_char* frm, u_int off, u_int x,
				   u_int width, u_int height) const;
	void map_down_422(const u_char* frm, u_int off, u_int x,
				  u_int width, u_int height) const;
	void map_up2_422(const u_char* frm, u_int off, u_int x,
				 u_int width, u_int height) const;
	void map_411(const u_char* frm, u_int off, u_int x,
			     u_int width, u_int height) const;
	void map_down2_411(const u_char* frm, u_int off, u_int x,
				   u_int width, u_int height) const;
	void map_down4_411(const u_char* frm, u_int off, u_int x,
				   u_int width, u_int height) const;
	void map_down_411(const u_char* frm, u_int off, u_int x,
				  u_int width, u_int height) const;
	void map_up2_411(const u_char* frm, u_int off, u_int x,
				 u_int width, u_int height) const;
	void map_gray(const u_char* frm, u_int off, u_int x,
			      u_int width, u_int height) const;
	void map_gray_down2(const u_char* frm, u_int off, u_int x,
				    u_int width, u_int height) const;
	void map_gray_down4(const u_char* frm, u_int off, u_int x,
				    u_int width, u_int height) const;
	void map_gray_down(const u_char* frm, u_int off, u_int x,
				   u_int width, u_int height) const;
	void map_gray_up2(const u_char* frm, u_int off, u_int x,
				  u_int width, u_int height) const;
#if MMX_CSCONV_ENABLED
        inline void map_422_asm(const u_char* yp, const u_char* up,
				const u_char* vp, char* xip) const;
        inline void map_411_asm(const u_char* yp, const u_char* up,
				const u_char* vp, char* xip, u_int iw) const;
#endif
};

class TrueWindowRenderer32;
typedef void (TrueWindowRenderer32::*True32Method)(const u_char*, u_int,
					   u_int, u_int, u_int) const;

class TrueWindowRenderer32 : public TrueWindowRenderer {
public:
    TrueWindowRenderer32(VideoWindow* vw, int decimation, int heuristics, TrueColorModel& cm)
	: TrueWindowRenderer(vw, decimation, heuristics, cm) {}
    virtual void render(const u_char* frm, int off, int x, int w, int h) {
	(this->*method_)(frm, off, x, w, h);
    }
protected:
	virtual void update();
	virtual void disable() { method_ = True32Method(&WindowRenderer::dither_null); }
	True32Method method_;

	void map_422(const u_char* frm, u_int off, u_int x,
			     u_int width, u_int height) const;
	void map_down2_422(const u_char* frm, u_int off, u_int x,
				   u_int width, u_int height) const;
	void map_down4_422(const u_char* frm, u_int off, u_int x,
				   u_int width, u_int height) const;
	void map_down_422(const u_char* frm, u_int off, u_int x,
				  u_int width, u_int height) const;
	void map_up2_422(const u_char* frm, u_int off, u_int x,
				 u_int width, u_int height) const;
	void map_411(const u_char* frm, u_int off, u_int x,
			     u_int width, u_int height) const;
	void map_down2_411(const u_char* frm, u_int off, u_int x,
				   u_int width, u_int height) const;
	void map_down4_411(const u_char* frm, u_int off, u_int x,
				   u_int width, u_int height) const;
	void map_down_411(const u_char* frm, u_int off, u_int x,
				  u_int width, u_int height) const;
	void map_up2_411(const u_char* frm, u_int off, u_int x,
				 u_int width, u_int height) const;
	void map_gray(const u_char* frm, u_int off, u_int x,
			      u_int width, u_int height) const;
	void map_gray_down2(const u_char* frm, u_int off, u_int x,
				    u_int width, u_int height) const;
	void map_gray_down4(const u_char* frm, u_int off, u_int x,
				    u_int width, u_int height) const;
	void map_gray_down(const u_char* frm, u_int off, u_int x,
				   u_int width, u_int height) const;
	void map_gray_up2(const u_char* frm, u_int off, u_int x,
				  u_int width, u_int height) const;
#if MMX_CSCONV_ENABLED
        inline void map_422_asm(const u_char* yp, const u_char* up,
				const u_char* vp, RGBPointer xip) const;
        inline void map_411_asm(const u_char* yp, const u_char* up, const
				u_char* vp, RGBPointer xip, u_int iw) const;
#endif
};

/*
 * Tcl class binding for "Renderer/TrueColor/24".
 *
 * create() requires exactly 8 arguments and aborts otherwise.  It reads
 *   argv[4] - name of the TrueColorModel object,
 *   argv[5] - name of the VideoWindow,
 *   argv[6] - decimation (parsed with atoi),
 *   argv[7] - heuristics (parsed with atoi),
 * and builds the renderer matching the window's bits per pixel (24 or
 * 32).  Any other depth aborts.
 */
static class TrueRendererClass : public TclClass {
public:
	TrueRendererClass() : TclClass("Renderer/TrueColor/24") {}

	TclObject* create(int argc, const char*const* argv) {
		if (argc != 8)
			abort();

		TrueColorModel* cm = (TrueColorModel*)TclObject::lookup(argv[4]);
		VideoWindow* vw = VideoWindow::lookup(argv[5]);
		const int decimation = atoi(argv[6]);
		const int heuristics = atoi(argv[7]);

		TclObject* renderer = NULL;
		if (vw->bpp() == 24)
			renderer = new TrueWindowRenderer24(vw, decimation, heuristics, *cm);
		else if (vw->bpp() == 32)
			renderer = new TrueWindowRenderer32(vw, decimation, heuristics, *cm);

		if (renderer == NULL)
			abort();  // FIXME
		return renderer;
	}
} truerenderer_class;

void TrueWindowRenderer24::update()
{
	static True24Method methods[] = {
	    &TrueWindowRenderer24::map_up2_411,
	    &TrueWindowRenderer24::map_up2_422,
	    &TrueWindowRenderer24::map_gray_up2,
	    &TrueWindowRenderer24::map_gray_up2,
	    &TrueWindowRenderer24::map_411,
	    &TrueWindowRenderer24::map_422,
	    &TrueWindowRenderer24::map_gray,
	    &TrueWindowRenderer24::map_gray,
	    &TrueWindowRenderer24::map_down2_411,
	    &TrueWindowRenderer24::map_down2_422,
	    &TrueWindowRenderer24::map_gray_down2,
	    &TrueWindowRenderer24::map_gray_down2,
	    &TrueWindowRenderer24::map_down4_411,
	    &TrueWindowRenderer24::map_down4_422,
	    &TrueWindowRenderer24::map_gray_down4,
	    &TrueWindowRenderer24::map_gray_down4,
	    &TrueWindowRenderer24::map_down_411,
	    &TrueWindowRenderer24::map_down_422,
	    &TrueWindowRenderer24::map_gray_down,
	    &TrueWindowRenderer24::map_gray_down,
	};
	method_ = methods[index()];
}

void TrueWindowRenderer32::update()
{
	static True32Method methods[] = {
	    &TrueWindowRenderer32::map_up2_411,
	    &TrueWindowRenderer32::map_up2_422,
	    &TrueWindowRenderer32::map_gray_up2,
	    &TrueWindowRenderer32::map_gray_up2,
	    &TrueWindowRenderer32::map_411,
	    &TrueWindowRenderer32::map_422,
	    &TrueWindowRenderer32::map_gray,
	    &TrueWindowRenderer32::map_gray,
	    &TrueWindowRenderer32::map_down2_411,
	    &TrueWindowRenderer32::map_down2_422,
	    &TrueWindowRenderer32::map_gray_down2,
	    &TrueWindowRenderer32::map_gray_down2,
	    &TrueWindowRenderer32::map_down4_411,
	    &TrueWindowRenderer32::map_down4_422,
	    &TrueWindowRenderer32::map_gray_down4,
	    &TrueWindowRenderer32::map_gray_down4,
	    &TrueWindowRenderer32::map_down_411,
	    &TrueWindowRenderer32::map_down_422,
	    &TrueWindowRenderer32::map_gray_down,
	    &TrueWindowRenderer32::map_gray_down,
	};
	method_ = methods[index()];
}

/*
 * Byte-order helpers for code that loads four consecutive bytes as one
 * 32-bit word.
 *
 * SHIFT_24 is the right-shift that exposes the byte at the lowest memory
 * address, SHIFT_16 the next one, then SHIFT_8, and SHIFT_0 the byte at
 * the highest address (see map_gray(), which writes pixels 0..3 from
 * SHIFT_24, SHIFT_16, SHIFT_8, SHIFT_0 in that order).
 *
 * UV0..UV3 build table offsets from local variables named `u' and `v'.
 * In the visible part of this file they are referenced only from the
 * disabled (notdef) map_411_16() routine.
 */
#if BYTE_ORDER == LITTLE_ENDIAN
#define SHIFT_0		24
#define SHIFT_8		16
#define SHIFT_16	8
#define SHIFT_24	0
#define UV0 ((v << 5) & 0x3ff00)
#define UV1 ((u << 2) & 0x3ff00)
#define UV2 ((v >> 11) & 0x3ff00)
#define UV3 ((u >> 14) & 0x3ff00)
#else
#define SHIFT_0		0
#define SHIFT_8		8
#define SHIFT_16	16
#define SHIFT_24	24
#define UV0 ((u >> 12) & 0xfff00)
#define UV1 ((v >> 10) & 0xfff00)
#define UV2 ((u << 4) & 0xfff00)
#define UV3 ((v << 6) & 0xfff00)
#endif

/*
 * This routine sums the luma & chroma components of one pixel &
 * constructs an rgb output.  It does all three r g b components
 * in parallel.  The one complication is that it has to
 * deal with overflow (sum > 255) and underflow (sum < 0).  Underflow
 * & overflow are only possible if both terms have the same sign and
 * are indicated by the result having a different sign than the terms.
 * Note that we ignore the carry into the next byte's lsb that happens
 * on an overflow/underflow on the grounds that it's probably invisible.
 * The luma term and sum are biased by 128 so a negative number has the
 * 2^7 bit = 0.  The chroma term is not biased so a negative number has
 * the 2^7 bit = 1.  So underflow is indicated by (L & C & sum) != 0;
 */
// #ifdef WIN32

// #define ONERGB(dst, rgb) /* \ */
// 	(dst).rgbtBlue = GetBValue(rgb); /* \ */
// 	(dst).rgbtGreen = GetGValue(rgb); /* \ */
// 	(dst).rgbtRed = GetRValue(rgb);

// #define ONEGRAY(dst, pix) /* \ */
// 	(dst).rgbtBlue = (dst).rgbtGreen = (dst).rgbtRed = (pix & 0xff);

// #else /* !WIN32 */

/*
 * ONERGB(dst, rgb)  - store an already-assembled pixel value.
 * ONEGRAY(dst, pix) - store a gray pixel: the 8-bit value `pix' copied
 *                     into bits 0-7, 8-15 and 16-23.
 *
 * Both expand to a complete statement (the trailing ';' is part of the
 * macro because existing callers rely on it).  Every argument is
 * parenthesised so that expression arguments such as `a | b' expand
 * correctly; `pix' is evaluated three times, so it must not have side
 * effects.
 */
#define ONERGB(dst, rgb) \
	(dst) = (rgb);

#define ONEGRAY(dst, pix) \
	(dst) = ((pix) << 16) | ((pix) << 8) | (pix);

// #endif /* !WIN32 */

/*
 * ONEPIX(src, dst) -- combine one luma sample with the current chroma
 * term and store the resulting pixel in `dst'.
 *
 * The macro relies on these names being declared by the enclosing scope:
 *   uv     (in)      chroma term, one entry of the color model's table
 *   omask  (in)      mask of the top bit of each color channel
 *   pmask  (in)      mask of all channel bits
 *   l, sum, uflo     scratch u_ints; `sum' holds the unmasked result on
 *                    exit and `dst' receives sum & pmask
 *
 * Steps: replicate the luma byte into all four bytes of `l', add `uv',
 * then use omask to find the channels whose top bit shows an overflow or
 * underflow.  Overflowed channels are forced to all ones and underflowed
 * channels to zero by smearing the flagged top bit down over the seven
 * bits below it.
 */
#define ONEPIX(src, dst) { \
	l = src; \
	l |= l << 8; l |= l << 16; \
	sum = l + uv; \
	uflo = (l ^ uv) & (l ^ sum) & omask; \
	if (uflo) { \
		if ((l = uflo & l) != 0) { \
			/* saturate overflow(s) */ \
			l |= l >> 1; \
			l |= l >> 2; \
			l |= l >> 4; \
			sum |= l; \
			uflo &=~ l; \
		} \
		if (uflo != 0) { \
			/* zero underflow(s) */ \
			uflo |= uflo >> 1; \
			uflo |= uflo >> 2; \
			uflo |= uflo >> 4; \
			sum &=~ uflo; \
		} \
	} \
	ONERGB(dst, sum & pmask); \
}

void TrueWindowRenderer32::map_422(const u_char* frm, u_int off, u_int /* x */,
                                   u_int width, u_int height) const
{
	register u_int iw = width_;
	register const u_char* yp = frm + off;
	register const u_char* up = frm + framesize_ + (off >> 1);
	register const u_char* vp = up + (framesize_ >> 1);
	register RGBPointer xip = (RGBPointer)pixbuf_ + off;
	register int w = width;

#if MMX_CSCONV_ENABLED

	pstride = iw - w;
	cstride = pstride >> 1;
	pstride2 = pstride << 2;
	picsize = w*height;
	segwidth = width;
	segwidthconst = segwidth;
	map_422_asm(yp, up, vp, xip);

#else

	register const u_int* yuv2rgb = cm_.uvtab();
	register u_int omask = cm_.omask();
	register u_int pmask = cm_.pmask();

	for (register int len = w * height; len > 0; len -= 8) {
		register u_int l, uv;
		register u_int uflo, sum;

#define TWO422(n) \
		uv = yuv2rgb[(up[(n)/2] << 8) | vp[(n)/2]]; \
		ONEPIX(yp[(n)], xip[(n)]) \
		ONEPIX(yp[(n)+1], xip[(n)+1])

		TWO422(0)
		TWO422(2)
		TWO422(4)
		TWO422(6)

		xip += 8;
		yp += 8;
		up += 4;
		vp += 4;

		w -= 8;
		if (w <= 0) {
			w = width;
			register int pstride = iw - w;
			register int cstride = pstride >> 1;
			yp += pstride;
			up += cstride;
			vp += cstride;
			xip += pstride;
		}
	}

#endif
}

void TrueWindowRenderer32::map_down2_422(const u_char* frm,
				       u_int off, u_int x,
				       u_int width, u_int height) const
{
	register u_int iw = width_;
	register const u_char* yp = frm + off;
	register const u_char* up = frm + framesize_ + (off >> 1);
	register const u_char* vp = up + (framesize_ >> 1);
	register RGBPointer xip = (RGBPointer)pixbuf_ + ((off - x) >> 2) + (x >> 2);
	register int w = width;
	register const u_int* yuv2rgb = cm_.uvtab();
	register u_int omask = cm_.omask();
	register u_int pmask = cm_.pmask();

	for (register int len = w * height >> 1; len > 0; len -= 8) {
		register u_int l, uv;
		register u_int uflo, sum;

#define ONE422(n) \
		uv = yuv2rgb[(up[(n)/2] << 8) | vp[(n)/2]]; \
		ONEPIX(yp[(n)], xip[(n)/2])

		ONE422(0)
		ONE422(2)
		ONE422(4)
		ONE422(6)

		xip += 4;
		yp += 8;
		up += 4;
		vp += 4;

		w -= 8;
		if (w <= 0) {
			w = width;
			register int pstride = 2 * iw - w;
			register int cstride = pstride >> 1;
			yp += pstride;
			up += cstride;
			vp += cstride;
			xip += (iw - w) >> 1;
		}
	}
}


void TrueWindowRenderer32::map_down4_422(const u_char* frm,
				       u_int off, u_int x,
				       u_int width, u_int height) const
{
	register u_int iw = width_;
	register const u_char* yp = frm + off;
	register const u_char* up = frm + framesize_ + (off >> 1);
	register const u_char* vp = up + (framesize_ >> 1);
	register RGBPointer xip = (RGBPointer)pixbuf_ + ((off - x) >> 4) + (x >> 2);
	register int w = width;
	register const u_int* yuv2rgb = cm_.uvtab();
	register u_int omask = cm_.omask();
	register u_int pmask = cm_.pmask();

	for (register int len = w * height >> 2; len > 0; len -= 8) {
		register u_int l, uv;
		register u_int uflo, sum;

		uv = yuv2rgb[(up[0] << 8) | vp[0]];
		ONEPIX(yp[0], xip[0])
		uv = yuv2rgb[(up[2] << 8) | vp[2]];
		ONEPIX(yp[4], xip[1])

		xip += 2;
		yp += 8;
		up += 4;
		vp += 4;

		w -= 8;
		if (w <= 0) {
			w = width;
			register int pstride = 4 * iw - w;
			register int cstride = pstride >> 1;
			yp += pstride;
			up += cstride;
			vp += cstride;
			xip += (iw - w) >> 2;
		}
	}
}

/*
 * decimate by some power of 2 >= 2^3.
 */
void TrueWindowRenderer32::map_down_422(const u_char* frm,
				      u_int off, u_int x,
				      u_int width, u_int height) const
{
	register u_int iw = width_;
	register const u_char* yp = frm + off;
	register const u_char* up = frm + framesize_ + (off >> 1);
	register const u_char* vp = up + (framesize_ >> 1);
	register int s = scale_;
	register int istride = 1 << s;
	register RGBPointer xip = (RGBPointer)pixbuf_ +
		((off - x) >> (s + s)) + (x >> s);
	register int w = width;
	register const u_int* yuv2rgb = cm_.uvtab();
	register u_int omask = cm_.omask();
	register u_int pmask = cm_.pmask();

	for (register int len = w * height >> s; len > 0; len -= istride) {
		register u_int l, uv;
		register u_int uflo, sum;

		uv = yuv2rgb[(up[0] << 8) | vp[0]];
		ONEPIX(yp[0], xip[0])

		xip += 1;
		yp += istride;
		up += istride >> 1;
		vp += istride >> 1;

		w -= istride;
		if (w <= 0) {
			w = width;
			register int pstride = (iw << s) - w;
			register int cstride = pstride >> 1;
			yp += pstride;
			up += cstride;
			vp += cstride;
			xip += (iw - w) >> s;
		}
	}
}

void TrueWindowRenderer32::map_up2_422(const u_char* frm,
				     u_int off, u_int x,
				     u_int width, u_int height) const
{
	register u_int iw = width_;
	register const u_char* yp = frm + off;
	register const u_char* up = frm + framesize_ + (off >> 1);
	register const u_char* vp = up + (framesize_ >> 1);
	register RGBPointer xip = (RGBPointer)pixbuf_ + ((off - x) << 2) + (x << 1);
	register int w = width;
	register u_int e1 = yp[0];
	register const u_int* yuv2rgb = cm_.uvtab();
	register u_int omask = cm_.omask();
	register u_int pmask = cm_.pmask();

	for (register int len = w * height; len > 0; len -= 2) {
		register u_int l, uv;
		register u_int uflo, sum;
		register u_int e2;
		register RGBPointer xip2 = xip + (iw << 1);

		uv = yuv2rgb[(up[0] << 8) | vp[0]];
		e2 = yp[0];
		ONEPIX((e1 + e2) >> 1, xip[0])
		ONERGB(xip2[0], sum);
		ONEPIX(e2, xip[1])
		ONERGB(xip2[1], sum);
		e1 = yp[1];
		ONEPIX((e1 + e2) >> 1, xip[2])
		ONERGB(xip2[2], sum);
		ONEPIX(e1, xip[3])
		ONERGB(xip2[3], sum);

		xip += 4;
		yp += 2;
		up += 1;
		vp += 1;

		w -= 2;
		if (w <= 0) {
			w = width;
			register u_int pstride = iw - w;
			register u_int cstride = pstride >> 1;
			yp += pstride;
			e1 = yp[0];
			up += cstride;
			vp += cstride;
			xip += (iw + pstride) << 1;
		}
	}
}

void TrueWindowRenderer32::map_411(const u_char* frm, u_int off,
				 u_int x, u_int width, u_int height) const
{
	register u_int iw = width_;
	register const u_char* yp = frm + off;
	register const u_char* up = frm + framesize_ + ((off - x) >> 2) + (x >> 1);
	register const u_char* vp = up + (framesize_ >> 2);
	register RGBPointer xip = (RGBPointer)pixbuf_ + off;
	register int w = width;

#if MMX_CSCONV_ENABLED

	picsize = w*height;
	segwidth = width;
	segwidthconst = width;
	pstride = 2 * iw - w;
	cstride = (iw - w) >> 1;
	pstride2 = pstride << 2;
	map_411_asm(yp, up, vp, xip, iw);

#else

	register const u_int* yuv2rgb = cm_.uvtab();
	register u_int omask = cm_.omask();
	register u_int pmask = cm_.pmask();

	for (register int len = w * height; len > 0; len -= 8) {
		register u_int l, uv;
		register u_int uflo, sum;
		register RGBPointer xip2 = xip + iw;
		register const u_char* yp2 = yp + iw;

#define FOUR411(n) \
		uv = yuv2rgb[(up[(n)/2] << 8) | vp[(n)/2]]; \
		ONEPIX(yp[(n)], xip[(n)]) \
		ONEPIX(yp[(n)+1], xip[(n)+1]) \
		ONEPIX(yp2[(n)], xip2[(n)]) \
		ONEPIX(yp2[(n)+1], xip2[(n)+1])

		FOUR411(0)
		FOUR411(2)

		xip += 4;
		yp += 4;
		up += 2;
		vp += 2;

		w -= 4;
		if (w <= 0) {
			w = width;
			register int pstride = 2 * iw - w;
			register int cstride = (iw - w) >> 1;
			yp += pstride;
			up += cstride;
			vp += cstride;
			xip += pstride;
		}
	}

#endif
}

/*
 * Disabled code: an unfinished 16-bit-pixel variant of map_411().
 * It is compiled only when `notdef' is defined, and its body only when
 * FIX_THIS is also defined.  It refers to cm_.yuv2rgb(), which the
 * TrueColorModel declared in this file does not provide, and it is not
 * declared in the TrueWindowRenderer32 class.
 */
#ifdef notdef
void TrueWindowRenderer32::map_411_16(const u_char* frm, u_int off,
				    u_int x, u_int width, u_int height) const
{
#ifdef FIX_THIS
	register u_int iw = width_;
	register const u_char* yp = frm + off;
	register const u_char* up = frm + framesize_ + ((off - x) >> 2) + (x >> 1);
	register const u_char* vp = up + (framesize_ >> 2);
	/*FIXME use u_int here*/
	register u_short* xip = (u_short*)pixbuf_ + off;
	register int w = width;
	register const u_char* yuv2rgb = (u_char*)cm_.yuv2rgb();
	for (register int len = width * height; len > 0; len -= 16) {
		register u_int u = *(const u_int*)up;
		register u_int v = *(const u_int*)vp;
		register u_int y1, y2;
		register const u_char* y2r;

		y1 = (u & 0xf800f800) | ((v & 0xf800f800) >> 5);
		v = ((u & 0x00f800f8) << 5) | (v & 0x00f800f8);
		u = y1;

		y1 = *(const u_int*)yp;
		y2r = yuv2rgb + UV0;
		xip[0] =  *(u_int*)(y2r + ((y1 >> SHIFT_24) & 0xfc));
		xip[1] =  *(u_int*)(y2r + ((y1 >> SHIFT_16) & 0xfc));
		y2 = *(const u_int*)(yp + iw);
		register u_short* xip2 = xip + iw;
		xip2[0] = *(u_int*)(y2r + ((y2 >> SHIFT_24) & 0xfc));
		xip2[1] = *(u_int*)(y2r + ((y2 >> SHIFT_16) & 0xfc));

		y2r = yuv2rgb + UV1;
		xip[2] =  *(u_int*)(y2r + ((y1 >> SHIFT_8) & 0xfc));
		xip[3] =  *(u_int*)(y2r + ((y1 >> SHIFT_0) & 0xfc));
		xip2[2] = *(u_int*)(y2r + ((y2 >> SHIFT_8) & 0xfc));
		xip2[3] = *(u_int*)(y2r + ((y2 >> SHIFT_0) & 0xfc));

		y1 = *(const u_int*)(yp + 4);
		y2r = yuv2rgb + UV2;
		xip[4] =  *(u_int*)(y2r + ((y1 >> SHIFT_24) & 0xfc));
		xip[5] =  *(u_int*)(y2r + ((y1 >> SHIFT_16) & 0xfc));
		y2 = *(const u_int*)(yp + iw + 4);
		xip2[4] = *(u_int*)(y2r + ((y2 >> SHIFT_24) & 0xfc));
		xip2[5] = *(u_int*)(y2r + ((y2 >> SHIFT_16) & 0xfc));

		y2r = yuv2rgb + UV3;
		xip[6] =  *(u_int*)(y2r + ((y1 >> SHIFT_8) & 0xfc));
		xip[7] =  *(u_int*)(y2r + ((y1 >> SHIFT_0) & 0xfc));
		xip2[6] = *(u_int*)(y2r + ((y2 >> SHIFT_8) & 0xfc));
		xip2[7] = *(u_int*)(y2r + ((y2 >> SHIFT_0) & 0xfc));

		xip += 8;
		yp += 8;
		up += 4;
		vp += 4;

		w -= 8;
		if (w <= 0) {
			w = width;
			register int pstride = 2 * iw - w;
			register int cstride = (iw - w) >> 1;
			yp += pstride;
			up += cstride;
			vp += cstride;
			xip += pstride;
		}
	}
#endif
}
#endif

void TrueWindowRenderer32::map_down2_411(const u_char* frm,
				       u_int off, u_int x,
				       u_int width, u_int height) const
{
	register u_int iw = width_;
	register const u_char* yp = frm + off;
	off = ((off - x) >> 2) + (x >> 1);
	register const u_char* up = frm + framesize_ + off;
	register const u_char* vp = up + (framesize_ >> 2);
	register RGBPointer xip = (RGBPointer)pixbuf_ + off;
	register int w = width;
	register const u_int* yuv2rgb = cm_.uvtab();
	register u_int omask = cm_.omask();
	register u_int pmask = cm_.pmask();

	for (register int len = w * height >> 1; len > 0; len -= 8) {
		register u_int l, uv;
		register u_int uflo, sum;

#define ONE411(n) \
		uv = yuv2rgb[(up[(n)/2] << 8) | vp[(n)/2]]; \
		ONEPIX(yp[(n)], xip[(n)/2])

		ONE411(0)
		ONE411(2)
		ONE411(4)
		ONE411(6)

		xip += 4;
		yp += 8;
		up += 4;
		vp += 4;

		w -= 8;
		if (w <= 0) {
			w = width;
			register int pstride = 2 * iw - w;
			register int cstride = (iw - w) >> 1;
			yp += pstride;
			up += cstride;
			vp += cstride;
			xip += cstride;
		}
	}
}

void TrueWindowRenderer32::map_down4_411(const u_char* frm,
				       u_int off, u_int x,
				       u_int width, u_int height) const
{
	register u_int iw = width_;
	register const u_char* yp = frm + off;
	register const u_char* up = frm + framesize_ + ((off - x) >> 2) + (x >> 1);
	register const u_char* vp = up + (framesize_ >> 2);
	register RGBPointer xip = (RGBPointer)pixbuf_ + ((off - x) >> 4) + (x >> 2);
	register int w = width;
	register const u_int* yuv2rgb = cm_.uvtab();
	register u_int omask = cm_.omask();
	register u_int pmask = cm_.pmask();

	for (register int len = w * height >> 2; len > 0; len -= 8) {
		register u_int l, uv;
		register u_int uflo, sum;

		uv = yuv2rgb[(up[0] << 8) | vp[0]];
		ONEPIX(yp[0], xip[0])
		uv = yuv2rgb[(up[2] << 8) | vp[2]];
		ONEPIX(yp[4], xip[1])

		xip += 2;
		yp += 8;
		up += 4;
		vp += 4;

		w -= 8;
		if (w <= 0) {
			w = width;
			register int pstride = 4 * iw - w;
			register int cstride = iw - (w >> 1);
			yp += pstride;
			up += cstride;
			vp += cstride;
			xip += (iw - w) >> 2;
		}
	}
}

/*
 * decimate by some power of 2 >= 2^3.
 */
void TrueWindowRenderer32::map_down_411(const u_char* frm,
				      u_int off, u_int x,
				      u_int width, u_int height) const
{
	register u_int iw = width_;
	register const u_char* yp = frm + off;
	register const u_char* up = frm + framesize_ + ((off - x) >> 2) + (x >> 1);
	register const u_char* vp = up + (framesize_ >> 2);
	register int s = scale_;
	register int istride = 1 << s;
	register RGBPointer xip = (RGBPointer)pixbuf_
		+ ((off - x) >> (s + s)) + (x >> s);
	register int w = width;
	register const u_int* yuv2rgb = cm_.uvtab();
	register u_int omask = cm_.omask();
	register u_int pmask = cm_.pmask();

	for (register int len = w * height >> s; len > 0; len -= istride) {
		register u_int l, uv;
		register u_int uflo, sum;

		uv = yuv2rgb[(up[0] << 8) | vp[0]];
		ONEPIX(yp[0], xip[0])

		yp += istride;
		up += istride >> 1;
		vp += istride >> 1;

		w -= istride;
		if (w <= 0) {
			w = width;
			register int pstride = (iw << s) - w;
			register int cstride = (iw << (s - 1)) - (w >> 1);
			yp += pstride;
			up += cstride;
			vp += cstride;
			xip += (iw - w) >> s;
		}
	}
}

void TrueWindowRenderer32::map_up2_411(const u_char* frm,
				     u_int off, u_int x,
				     u_int width, u_int height) const
{
	register u_int iw = width_;
	register const u_char* yp = frm + off;
	register const u_char* up = frm + framesize_ + ((off - x) >> 2) + (x >> 1);
	register const u_char* vp = up + (framesize_ >> 2);
	register RGBPointer xip = (RGBPointer)pixbuf_ + ((off - x) << 2) + (x << 1);
	register int w = width;
	register u_int e1 = yp[0], o1 = yp[iw];
	register const u_int* yuv2rgb = cm_.uvtab();
	register u_int omask = cm_.omask();
	register u_int pmask = cm_.pmask();

	for (register int len = w * height; len > 0; len -= 4) {
		register u_int l, uv;
		register u_int uflo, sum;
		register u_int e2, o2;
		register const u_char* yp2 = yp + iw;
		register RGBPointer xip2 = xip + (iw << 1);
		register RGBPointer xip3 = xip2 + (iw << 1);
		register RGBPointer xip4 = xip3 + (iw << 1);

		uv = yuv2rgb[(up[0] << 8) | vp[0]];
		e2 = yp[0];
		ONEPIX((e1 + e2) >> 1, xip[0])
		ONERGB(xip2[0], sum);
		ONEPIX(e2, xip[1])
		ONERGB(xip2[1], sum);
		e1 = yp[1];
		ONEPIX((e1 + e2) >> 1, xip[2])
		ONERGB(xip2[2], sum);
		ONEPIX(e1, xip[3])
		ONERGB(xip2[3], sum);

		o2 = yp2[0];
		ONEPIX((o1 + o2) >> 1, xip3[0])
		ONERGB(xip4[0], sum);
		ONEPIX(o2, xip3[1])
		ONERGB(xip4[1], sum);
		o1 = yp2[1];
		ONEPIX((o1 + o2) >> 1, xip3[2])
		ONERGB(xip4[2], sum);
		ONEPIX(o1, xip3[3])
		ONERGB(xip4[3], sum);


		xip += 4;
		yp += 2;
		up += 1;
		vp += 1;

		w -= 2;
		if (w <= 0) {
			w = width;
			register u_int pstride = 2 * iw - w;
			register u_int cstride = (iw - w) >> 1;
			yp += pstride;
			e1 = yp[0];
			o1 = yp[iw];
			up += cstride;
			vp += cstride;
			xip += 8 * iw - 2 * w;
		}
	}
}

void TrueWindowRenderer32::map_gray(register const u_char *yp,
				  u_int off, u_int /* x */,
				  u_int width, u_int height) const
{
	register u_int iw = width_;
	yp += off;
	register RGBPointer xip = (RGBPointer)pixbuf_ + off;
	register int w = width;
	for (register int len = w * height; len > 0; len -= 8) {
		register u_int y1;
		register u_int pix;

		y1 = *(const u_int*)yp;
		pix = (y1 >> SHIFT_24) & 0xff;
		ONEGRAY(xip[0], pix);
		pix = (y1 >> SHIFT_16) & 0xff;
		ONEGRAY(xip[1], pix);
		pix = (y1 >> SHIFT_8) & 0xff;
		ONEGRAY(xip[2], pix);
		pix = (y1 >> SHIFT_0) & 0xff;
		ONEGRAY(xip[3], pix);

		y1 = *(const u_int*)(yp + 4);
		pix = (y1 >> SHIFT_24) & 0xff;
		ONEGRAY(xip[4], pix);
		pix = (y1 >> SHIFT_16) & 0xff;
		ONEGRAY(xip[5], pix);
		pix = (y1 >> SHIFT_8) & 0xff;
		ONEGRAY(xip[6], pix);
		pix = (y1 >> SHIFT_0) & 0xff;
		ONEGRAY(xip[7], pix);

		xip += 8;
		yp += 8;

		w -= 8;
		if (w <= 0) {
			w = width;
			register u_int pstride = iw - w;
			yp += pstride;
			xip += pstride;
		}
	}
}

void TrueWindowRenderer32::map_gray_down2(register const u_char *yp,
					u_int off, u_int x,
					u_int width, u_int height) const
{
	register u_int iw = width_;
	yp += off;
	off = ((off - x) >> 2) + (x >> 1);
	register RGBPointer xip = (RGBPointer)pixbuf_ + off;
	register int w = width;
	for (register int len = w * height >> 1; len > 0; len -= 8) {
		register u_int y1;
		register u_int pix;

		y1 = *(const u_int*)yp;
		pix = (y1 >> SHIFT_24) & 0xff;
		ONEGRAY(xip[0], pix);
		pix = (y1 >> SHIFT_8) & 0xff;
		ONEGRAY(xip[1], pix);

		y1 = *(const u_int*)(yp + 4);
		pix = (y1 >> SHIFT_24) & 0xff;
		ONEGRAY(xip[2], pix);
		pix = (y1 >> SHIFT_8) & 0xff;
		ONEGRAY(xip[3], pix);

		xip += 4;
		yp += 8;

		w -= 8;
		if (w <= 0) {
			w = width;
			register int pstride = 2 * iw - w;
			yp += pstride;
			xip += (iw - w) >> 1;
		}
	}
}

void TrueWindowRenderer32::map_gray_down4(register const u_char *yp,
					u_int off, u_int x,
					u_int width, u_int height) const
{
	register u_int iw = width_;
	yp += off;
	register RGBPointer xip = (RGBPointer)pixbuf_ + ((off - x) >> 4) + (x >> 2);
	register int w = width;
	for (register int len = w * height >> 2; len > 0; len -= 8) {
		register u_int pix;

		pix = yp[0];
		ONEGRAY(xip[0], pix);
		pix = yp[4];
		ONEGRAY(xip[1], pix);

		xip += 2;
		yp += 8;

		w -= 8;
		if (w <= 0) {
			w = width;
			register int pstride = 4 * iw - w;
			yp += pstride;
			xip += (iw - w) >> 2;
		}
	}
}

void TrueWindowRenderer32::map_gray_down(register const u_char *yp,
				       u_int off, u_int x,
				       u_int width, u_int height) const
{
	register u_int iw = width_;
	yp += off;
	register int s = scale_;
	register int istride = 1 << s;
	register RGBPointer xip = (RGBPointer)pixbuf_ +
		((off - x) >> (s + s)) + (x >> s);
	register int w = width;
	for (register int len = w * height >> s; len > 0; len -= istride) {
		register u_int pix = *yp;
		ONEGRAY(xip[0], pix);
		xip++;
		yp += istride;
		w -= istride;
		if (w <= 0) {
			w = width;
			register int pstride = (iw << s) - w;
			yp += pstride;
			xip += (iw - w) >> s;
		}
	}
}

void TrueWindowRenderer32::map_gray_up2(register const u_char *yp,
				      u_int off, u_int x,
				      u_int width, u_int height) const
{
	register u_int iw = width_;
	yp += off;
	register RGBPointer xip = (RGBPointer)pixbuf_ + ((off - x) << 2) + (x << 1);
	register int w = width;
	register u_int e1 = yp[0];

	for (register int len = width * height; len > 0; len -= 8) {
		register u_int y1, e2, pix;
		register RGBPointer xip2 = xip + iw * 2;

		y1 = *(const u_int*)yp;
		e2 = (y1 >> SHIFT_24) & 0xff;
		pix = (e1 + e2) >> 1;
		ONEGRAY(xip[0], pix);
		ONEGRAY(xip2[0], pix);
		ONEGRAY(xip[1], e2);
		ONEGRAY(xip2[1], e2);
		e1 = (y1 >> SHIFT_16) & 0xff;
		pix = (e1 + e2) >> 1;
		ONEGRAY(xip[2], pix);
		ONEGRAY(xip2[2], pix);
		ONEGRAY(xip[3], e1);
		ONEGRAY(xip2[3], e1);

		e2 = (y1 >> SHIFT_8) & 0xff;
		pix = (e1 + e2) >> 1;
		ONEGRAY(xip[4], pix);
		ONEGRAY(xip2[4], pix);
		ONEGRAY(xip[5], e2);
		ONEGRAY(xip2[5], e2);
		e1 = (y1 >> SHIFT_0) & 0xff;
		pix = (e1 + e2) >> 1;
		ONEGRAY(xip[6], pix);
		ONEGRAY(xip2[6], pix);
		ONEGRAY(xip[7], e1);
		ONEGRAY(xip2[7], e1);

		y1 = *(const u_int*)(yp + 4);
		e2 = (y1 >> SHIFT_24) & 0xff;
		pix = (e1 + e2) >> 1;
		ONEGRAY(xip[8], pix);
		ONEGRAY(xip2[8], pix);
		ONEGRAY(xip[9], e2);
		ONEGRAY(xip2[9], e2);
		e1 = (y1 >> SHIFT_16) & 0xff;
		pix = (e1 + e2) >> 1;
		ONEGRAY(xip[10], pix);
		ONEGRAY(xip2[10], pix);
		ONEGRAY(xip[11], e1);
		ONEGRAY(xip2[11], e1);

		e2 = (y1 >> SHIFT_8) & 0xff;
		pix = (e1 + e2) >> 1;
		ONEGRAY(xip[12], pix);
		ONEGRAY(xip2[12], pix);
		ONEGRAY(xip[13], e2);
		ONEGRAY(xip2[13], e2);
		e1 = (y1 >> SHIFT_0) & 0xff;
		pix = (e1 + e2) >> 1;
		ONEGRAY(xip[14], pix);
		ONEGRAY(xip2[14], pix);
		ONEGRAY(xip[15], e1);
		ONEGRAY(xip2[15], e1);

		xip += 16;
		yp += 8;

		w -= 8;
		if (w <= 0) {
			w = width;
			register u_int pstride = iw - w;
			yp += pstride;
			e1 = yp[0];
			xip += (iw + pstride) << 1;
		}
	}
}

// Dithers for 24 bpp displays
// FIXME might be possible to replace char* by u_int* somehow
//     or to do other optimizations

/*
 * PONERGB(dst, rgb): store the low three bytes of `rgb' into the
 * three consecutive bytes starting at the address of `dst'.  The
 * little-endian variant writes the least significant byte first,
 * the other variant writes bits 16..23 first.  Expands to three
 * separate statements (not wrapped in a block), and evaluates `rgb'
 * three times.
 */
#if BYTE_ORDER == LITTLE_ENDIAN
#define PONERGB(dst, rgb)						\
    (&(dst))[0] = (rgb);						\
    (&(dst))[1] = (rgb)>>8;						\
    (&(dst))[2] = (rgb)>>16;

#else
#define PONERGB(dst, rgb)						\
    (&(dst))[0] = (rgb)>>16;						\
    (&(dst))[1] = (rgb)>>8;						\
    (&(dst))[2] = (rgb);
#endif

/*
 * PONEGRAY(dst, pix): store `pix' into each of the three consecutive
 * bytes starting at the address of `dst'.  `pix' is not parenthesized
 * in the expansion, and the expansion is three separate statements.
 */
#define PONEGRAY(dst, pix)						\
    (&(dst))[0] = pix;							\
    (&(dst))[1] = pix;							\
    (&(dst))[2] = pix;

/*
 * PONEPIX(src, dst): combine one luminance value with the current
 * chroma contribution and store the result as three bytes at `dst'.
 *
 * Relies on these names being in scope at the point of use:
 *   l, sum, uflo  - u_int scratch variables (all overwritten)
 *   uv            - chroma term, read only
 *   omask, pmask  - masks, read only
 *
 * `src' is replicated into all four bytes of `l' and added to `uv'.
 * The bits selected by `omask' where the carry pattern is inconsistent
 * are collected in `uflo'; fields that overflowed are forced to all
 * ones in `sum', fields that underflowed are cleared.  `sum & pmask'
 * is then written with PONERGB.  `sum' keeps the clamped (unmasked)
 * value afterwards, and callers in this file reuse it to duplicate
 * the pixel.  The expansion is a braced block, so it is used without
 * a trailing semicolon.
 */
#define PONEPIX(src,dst) {						\
    l = src;								\
    l |= l << 8; l |= l << 16;						\
    sum = l + uv;							\
    uflo = (l ^ uv) & (l ^ sum) & omask;				\
    if (uflo) {								\
	if ((l = uflo & l) != 0) {					\
	    /* saturate overflow(s) */					\
	    l |= l >> 1;						\
	    l |= l >> 2;						\
	    l |= l >> 4;						\
	    sum |= l;							\
	    uflo &=~ l;							\
	}								\
	if (uflo != 0) {						\
	    /* zero underflow(s) */					\
	    uflo |= uflo >> 1;						\
	    uflo |= uflo >> 2;						\
	    uflo |= uflo >> 4;						\
	    sum &=~ uflo;						\
	}								\
    }									\
    PONERGB(dst, sum & pmask);						\
}

void TrueWindowRenderer24::map_422(const u_char* frm,
                                   u_int off, u_int /* x */,
                                   u_int width, u_int height) const
{
	register u_int iw = width_;
	register const u_char* yp = frm + off;
	register const u_char* up = frm + framesize_ + (off >> 1);
	register const u_char* vp = up + (framesize_ >> 1);
	register char* xip = (char*)pixbuf_ + 3*off;
	register int w = width;

#if MMX_CSCONV_ENABLED

	fprintf(stderr, "using 24 map_422\n");
	picsize = w*height;
	pstride = iw - w;
	cstride = pstride >> 1;
	pstride2 = pstride*3;
	segwidth = w;
	segwidthconst = segwidth;
	map_422_asm(yp, up, vp, xip);

#else
	register const u_int* yuv2rgb = cm_.uvtab();
	register u_int omask = cm_.omask();
	register u_int pmask = cm_.pmask();

	for (register int len = w * height; len > 0; len -= 8) {
		register u_int l, uv;
		register u_int uflo, sum;

#define PTWO422(n) \
		uv = yuv2rgb[(up[(n)/2] << 8) | vp[(n)/2]]; \
		PONEPIX(yp[(n)], xip[3*(n)]) \
		PONEPIX(yp[(n)+1], xip[3*(n)+3])

		PTWO422(0)
		PTWO422(2)
		PTWO422(4)
		PTWO422(6)

		xip += 24;
		yp += 8;
		up += 4;
		vp += 4;

		w -= 8;
		if (w <= 0) {
			w = width;
			register int pstride = iw - w;
			register int cstride = pstride >> 1;
			yp += pstride;
			up += cstride;
			vp += cstride;
			xip += 3*pstride;
		}
	}
#endif
}

void TrueWindowRenderer24::map_down2_422(const u_char* frm,
				       u_int off, u_int x,
				       u_int width, u_int height) const
{
	register u_int iw = width_;
	register const u_char* yp = frm + off;
	register const u_char* up = frm + framesize_ + (off >> 1);
	register const u_char* vp = up + (framesize_ >> 1);
	register char* xip = (char*)pixbuf_ + 3*(((off - x) >> 2) + (x >> 1));
	register int w = width;
	register const u_int* yuv2rgb = cm_.uvtab();
	register u_int omask = cm_.omask();
	register u_int pmask = cm_.pmask();

	for (register int len = w * height >> 1; len > 0; len -= 8) {
		register u_int l, uv;
		register u_int uflo, sum;

#define PONE422(n) \
		uv = yuv2rgb[(up[(n)/2] << 8) | vp[(n)/2]]; \
		PONEPIX(yp[(n)], xip[3*(n)/2])

		PONE422(0)
		PONE422(2)
		PONE422(4)
		PONE422(6)

		xip += 12;
		yp += 8;
		up += 4;
		vp += 4;

		w -= 8;
		if (w <= 0) {
			w = width;
			register int pstride = 2 * iw - w;
			register int cstride = pstride >> 1;
			yp += pstride;
			up += cstride;
			vp += cstride;
			xip += 3*((iw - w) >> 1);
		}
	}
}


void TrueWindowRenderer24::map_down4_422(const u_char* frm,
				       u_int off, u_int x,
				       u_int width, u_int height) const
{
	register u_int iw = width_;
	register const u_char* yp = frm + off;
	register const u_char* up = frm + framesize_ + (off >> 1);
	register const u_char* vp = up + (framesize_ >> 1);
	register char* xip = (char*)pixbuf_ + 3*(((off - x) >> 4) + (x >> 2));
	register int w = width;
	register const u_int* yuv2rgb = cm_.uvtab();
	register u_int omask = cm_.omask();
	register u_int pmask = cm_.pmask();

	for (register int len = w * height >> 2; len > 0; len -= 8) {
		register u_int l, uv;
		register u_int uflo, sum;

		uv = yuv2rgb[(up[0] << 8) | vp[0]];
		PONEPIX(yp[0], xip[0])
		uv = yuv2rgb[(up[2] << 8) | vp[2]];
		PONEPIX(yp[4], xip[3])

		xip += 6;
		yp += 8;
		up += 4;
		vp += 4;

		w -= 8;
		if (w <= 0) {
			w = width;
			register int pstride = 4 * iw - w;
			register int cstride = pstride >> 1;
			yp += pstride;
			up += cstride;
			vp += cstride;
			xip += 3*((iw - w) >> 2);
		}
	}
}

/*
 * decimate by some power of 2 >= 2^3.
 */
void TrueWindowRenderer24::map_down_422(const u_char* frm,
				      u_int off, u_int x,
				      u_int width, u_int height) const
{
	register u_int iw = width_;
	register const u_char* yp = frm + off;
	register const u_char* up = frm + framesize_ + (off >> 1);
	register const u_char* vp = up + (framesize_ >> 1);
	register int s = scale_;
	register int istride = 1 << s;
	register char* xip = (char*)pixbuf_ +
	    3*(((off - x) >> (s + s)) + (x >> s));
	register int w = width;
	register const u_int* yuv2rgb = cm_.uvtab();
	register u_int omask = cm_.omask();
	register u_int pmask = cm_.pmask();

	for (register int len = w * height >> s; len > 0; len -= istride) {
		register u_int l, uv;
		register u_int uflo, sum;

		uv = yuv2rgb[(up[0] << 8) | vp[0]];
		PONEPIX(yp[0], xip[0])

		xip += 3;
		yp += istride;
		up += istride >> 1;
		vp += istride >> 1;

		w -= istride;
		if (w <= 0) {
			w = width;
			register int pstride = (iw << s) - w;
			register int cstride = pstride >> 1;
			yp += pstride;
			up += cstride;
			vp += cstride;
			xip += 3*((iw - w) >> s);
		}
	}
}

void TrueWindowRenderer24::map_up2_422(const u_char* frm,
				     u_int off, u_int x,
				     u_int width, u_int height) const
{
	register u_int iw = width_;
	register const u_char* yp = frm + off;
	register const u_char* up = frm + framesize_ + (off >> 1);
	register const u_char* vp = up + (framesize_ >> 1);
	register char* xip = (char*)pixbuf_ + 3*(((off - x) << 2) + (x << 1));
	register int w = width;
	register u_int e1 = yp[0];
	register const u_int* yuv2rgb = cm_.uvtab();
	register u_int omask = cm_.omask();
	register u_int pmask = cm_.pmask();

	for (register int len = w * height; len > 0; len -= 2) {
		register u_int l, uv;
		register u_int uflo, sum;
		register u_int e2;
		register char* xip2 = xip + 3*((iw << 1));

		uv = yuv2rgb[(up[0] << 8) | vp[0]];
		e2 = yp[0];
		PONEPIX((e1 + e2) >> 1, xip[0])
		PONERGB(xip2[0], sum);
		PONEPIX(e2, xip[3])
		PONERGB(xip2[3], sum);
		e1 = yp[1];
		PONEPIX((e1 + e2) >> 1, xip[6])
		PONERGB(xip2[6], sum);
		PONEPIX(e1, xip[9])
		PONERGB(xip2[9], sum);

		xip += 12;
		yp += 2;
		up += 1;
		vp += 1;

		w -= 2;
		if (w <= 0) {
			w = width;
			register u_int pstride = iw - w;
			register u_int cstride = pstride >> 1;
			yp += pstride;
			e1 = yp[0];
			up += cstride;
			vp += cstride;
			xip += 3*((iw + pstride) << 1);
		}
	}
}

void TrueWindowRenderer24::map_411(const u_char* frm, u_int off,
				 u_int x, u_int width, u_int height) const
{
	register u_int iw = width_;
	register const u_char* yp = frm + off;
	register const u_char* up = frm + framesize_ + ((off - x) >> 2) + (x >> 1);
	register const u_char* vp = up + (framesize_ >> 2);
	register char* xip = (char*)pixbuf_ + 3*off;
	register int w = width;

#if MMX_CSCONV_ENABLED

	fprintf(stderr, "using 24 map_411\n");
	pstride = 2 * iw - w;
	cstride = (iw - w) >> 1;
	pstride2 = 3*pstride;
	picsize = w*height;
	segwidth = w;
	segwidthconst = segwidth;
	map_411_asm(yp, up, vp, xip, iw);

#else
	register const u_int* yuv2rgb = cm_.uvtab();
	register u_int omask = cm_.omask();
	register u_int pmask = cm_.pmask();

	for (register int len = w * height; len > 0; len -= 8) {
		register u_int l, uv;
		register u_int uflo, sum;
		register char* xip2 = xip + 3*iw;
		register const u_char* yp2 = yp + iw;

#define PFOUR411(n) \
		uv = yuv2rgb[(up[(n)/2] << 8) | vp[(n)/2]]; \
		PONEPIX(yp[(n)], xip[3*(n)]) \
		PONEPIX(yp[(n)+1], xip[3*(n)+3]) \
		PONEPIX(yp2[(n)], xip2[3*(n)]) \
		PONEPIX(yp2[(n)+1], xip2[3*(n)+3])

		PFOUR411(0)
		PFOUR411(2)

		xip += 12;
		yp += 4;
		up += 2;
		vp += 2;

		w -= 4;
		if (w <= 0) {
			w = width;
			register int pstride = 2 * iw - w;
			register int cstride = (iw - w) >> 1;
			yp += pstride;
			up += cstride;
			vp += cstride;
			xip += 3*pstride;
		}
	}
#endif
}

/*
 * Disabled 16-bit-per-pixel variant of map_411.  The whole definition
 * is compiled only when `notdef' is defined, and its body only when
 * FIX_THIS is also defined, so in a normal build this contributes
 * nothing.  The body looks up pixels in a byte-addressed table
 * (cm_.yuv2rgb()) at offsets UV0..UV3, which are not defined anywhere
 * in the visible part of this file.
 * NOTE(review): presumably UV0..UV3 were meant to be derived from the
 * packed `u'/`v' words computed at the top of the loop -- confirm
 * before re-enabling.
 */
#ifdef notdef
void TrueWindowRenderer24::map_411_16(const u_char* frm, u_int off,
				    u_int x, u_int width, u_int height) const
{
#ifdef FIX_THIS
	register u_int iw = width_;
	register const u_char* yp = frm + off;
	register const u_char* up = frm + framesize_ + ((off - x) >> 2) + (x >> 1);
	register const u_char* vp = up + (framesize_ >> 2);
	/*FIXME use u_int here*/
	register u_short* xip = (u_short*)pixbuf_ + off;
	register int w = width;
	register const u_char* yuv2rgb = (u_char*)cm_.yuv2rgb();
	for (register int len = width * height; len > 0; len -= 16) {
		/* four U and four V samples loaded as one word each */
		register u_int u = *(const u_int*)up;
		register u_int v = *(const u_int*)vp;
		register u_int y1, y2;
		register const u_char* y2r;

		/* interleave the top five bits of the U and V samples */
		y1 = (u & 0xf800f800) | ((v & 0xf800f800) >> 5);
		v = ((u & 0x00f800f8) << 5) | (v & 0x00f800f8);
		u = y1;

		/* columns 0-1, both rows */
		y1 = *(const u_int*)yp;
		y2r = yuv2rgb + UV0;
		xip[0] =  *(u_int*)(y2r + ((y1 >> SHIFT_24) & 0xfc));
		xip[1] =  *(u_int*)(y2r + ((y1 >> SHIFT_16) & 0xfc));
		y2 = *(const u_int*)(yp + iw);
		register u_short* xip2 = xip + iw;
		xip2[0] = *(u_int*)(y2r + ((y2 >> SHIFT_24) & 0xfc));
		xip2[1] = *(u_int*)(y2r + ((y2 >> SHIFT_16) & 0xfc));

		/* columns 2-3, both rows */
		y2r = yuv2rgb + UV1;
		xip[2] =  *(u_int*)(y2r + ((y1 >> SHIFT_8) & 0xfc));
		xip[3] =  *(u_int*)(y2r + ((y1 >> SHIFT_0) & 0xfc));
		xip2[2] = *(u_int*)(y2r + ((y2 >> SHIFT_8) & 0xfc));
		xip2[3] = *(u_int*)(y2r + ((y2 >> SHIFT_0) & 0xfc));

		/* columns 4-5, both rows */
		y1 = *(const u_int*)(yp + 4);
		y2r = yuv2rgb + UV2;
		xip[4] =  *(u_int*)(y2r + ((y1 >> SHIFT_24) & 0xfc));
		xip[5] =  *(u_int*)(y2r + ((y1 >> SHIFT_16) & 0xfc));
		y2 = *(const u_int*)(yp + iw + 4);
		xip2[4] = *(u_int*)(y2r + ((y2 >> SHIFT_24) & 0xfc));
		xip2[5] = *(u_int*)(y2r + ((y2 >> SHIFT_16) & 0xfc));

		/* columns 6-7, both rows */
		y2r = yuv2rgb + UV3;
		xip[6] =  *(u_int*)(y2r + ((y1 >> SHIFT_8) & 0xfc));
		xip[7] =  *(u_int*)(y2r + ((y1 >> SHIFT_0) & 0xfc));
		xip2[6] = *(u_int*)(y2r + ((y2 >> SHIFT_8) & 0xfc));
		xip2[7] = *(u_int*)(y2r + ((y2 >> SHIFT_0) & 0xfc));

		xip += 8;
		yp += 8;
		up += 4;
		vp += 4;

		w -= 8;
		if (w <= 0) {
			/* end of row pair: skip the remainder of this row
			 * and the second row that was already written */
			w = width;
			register int pstride = 2 * iw - w;
			register int cstride = (iw - w) >> 1;
			yp += pstride;
			up += cstride;
			vp += cstride;
			xip += pstride;
		}
	}
#endif
}
#endif

void TrueWindowRenderer24::map_down2_411(const u_char* frm,
				       u_int off, u_int x,
				       u_int width, u_int height) const
{
	register u_int iw = width_;
	register const u_char* yp = frm + off;
	off = ((off - x) >> 2) + (x >> 1);
	register const u_char* up = frm + framesize_ + off;
	register const u_char* vp = up + (framesize_ >> 2);
	register char* xip = (char*)pixbuf_ + 3*off;
	register int w = width;
	register const u_int* yuv2rgb = cm_.uvtab();
	register u_int omask = cm_.omask();
	register u_int pmask = cm_.pmask();

	for (register int len = w * height >> 1; len > 0; len -= 8) {
		register u_int l, uv;
		register u_int uflo, sum;

#define PONE411(n) \
		uv = yuv2rgb[(up[(n)/2] << 8) | vp[(n)/2]]; \
		PONEPIX(yp[(n)], xip[3*(n)/2])

		PONE411(0)
		PONE411(2)
		PONE411(4)
		PONE411(6)

		xip += 12;
		yp += 8;
		up += 4;
		vp += 4;

		w -= 8;
		if (w <= 0) {
			w = width;
			register int pstride = 2 * iw - w;
			register int cstride = (iw - w) >> 1;
			yp += pstride;
			up += cstride;
			vp += cstride;
			xip += 3*cstride;
		}
	}
}

void TrueWindowRenderer24::map_down4_411(const u_char* frm,
				       u_int off, u_int x,
				       u_int width, u_int height) const
{
	register u_int iw = width_;
	register const u_char* yp = frm + off;
	register const u_char* up = frm + framesize_ + ((off - x) >> 2) + (x >> 1);
	register const u_char* vp = up + (framesize_ >> 2);
	register char* xip = (char*)pixbuf_ + 3*(((off - x) >> 4) + (x >> 2));
	register int w = width;
	register const u_int* yuv2rgb = cm_.uvtab();
	register u_int omask = cm_.omask();
	register u_int pmask = cm_.pmask();

	for (register int len = w * height >> 2; len > 0; len -= 8) {
		register u_int l, uv;
		register u_int uflo, sum;

		uv = yuv2rgb[(up[0] << 8) | vp[0]];
		PONEPIX(yp[0], xip[0])
		uv = yuv2rgb[(up[2] << 8) | vp[2]];
		PONEPIX(yp[4], xip[3])

		xip += 6;
		yp += 8;
		up += 4;
		vp += 4;

		w -= 8;
		if (w <= 0) {
			w = width;
			register int pstride = 4 * iw - w;
			register int cstride = iw - (w >> 1);
			yp += pstride;
			up += cstride;
			vp += cstride;
			xip += 3*((iw - w) >> 2);
		}
	}
}

/*
 * decimate by some power of 2 >= 2^3.
 *
 * Decimating conversion to packed 24-bit pixels for frames whose
 * chroma planes are subsampled 2x2.  One Y sample (with the U/V pair
 * at the matching position) is converted out of every (1 << scale_)
 * along a row, and (1 << scale_) input rows are consumed per output
 * row.
 *
 *   frm     start of the frame (Y plane, followed by U and V)
 *   off     offset of the first Y sample of the rectangle
 *   x       column of that sample within its row
 *   width   rectangle width in Y samples
 *   height  rectangle height in Y rows
 */
void TrueWindowRenderer24::map_down_411(const u_char* frm,
				      u_int off, u_int x,
				      u_int width, u_int height) const
{
	register u_int iw = width_;
	register const u_char* yp = frm + off;
	register const u_char* up = frm + framesize_ + ((off - x) >> 2) + (x >> 1);
	register const u_char* vp = up + (framesize_ >> 2);
	register int s = scale_;
	register int istride = 1 << s;
	register char* xip = (char*)pixbuf_
		+ 3*(((off - x) >> (s + s)) + (x >> s));
	register int w = width;
	register const u_int* yuv2rgb = cm_.uvtab();
	register u_int omask = cm_.omask();
	register u_int pmask = cm_.pmask();

	for (register int len = w * height >> s; len > 0; len -= istride) {
		/* scratch names required by PONEPIX */
		register u_int l, uv;
		register u_int uflo, sum;

		uv = yuv2rgb[(up[0] << 8) | vp[0]];
		PONEPIX(yp[0], xip[0])

		/*
		 * Step to the next packed 3-byte output pixel.  Without
		 * this every pixel of a row would be written on top of
		 * the previous one; map_down_422 and map_gray_down do the
		 * same advance.
		 */
		xip += 3;
		yp += istride;
		up += istride >> 1;
		vp += istride >> 1;

		w -= istride;
		if (w <= 0) {
			w = width;
			register int pstride = (iw << s) - w;
			/*
			 * NOTE(review): map_down4_411 advances chroma by
			 * iw - (w >> 1) for four Y rows, which would
			 * correspond to iw << (s - 2) here.  The expression
			 * below matches the 4:2:2 variant instead and may
			 * skip twice as many chroma rows as intended --
			 * TODO confirm before changing.
			 */
			register int cstride = (iw << (s - 1)) - (w >> 1);
			yp += pstride;
			up += cstride;
			vp += cstride;
			xip += 3*((iw - w) >> s);
		}
	}
}

void TrueWindowRenderer24::map_up2_411(const u_char* frm,
				     u_int off, u_int x,
				     u_int width, u_int height) const
{
	register u_int iw = width_;
	register const u_char* yp = frm + off;
	register const u_char* up = frm + framesize_ + ((off - x) >> 2) + (x >> 1);
	register const u_char* vp = up + (framesize_ >> 2);
	register char* xip = (char*)pixbuf_ + 3*(((off - x) << 2) + (x << 1));
	register int w = width;
	register u_int e1 = yp[0], o1 = yp[iw];
	register const u_int* yuv2rgb = cm_.uvtab();
	register u_int omask = cm_.omask();
	register u_int pmask = cm_.pmask();

	for (register int len = w * height; len > 0; len -= 4) {
		register u_int l, uv;
		register u_int uflo, sum;
		register u_int e2, o2;
		register const u_char* yp2 = yp + iw;
		register char* xip2 = xip + 3*((iw << 1));
		register char* xip3 = xip2 + 3*((iw << 1));
		register char* xip4 = xip3 + 3*((iw << 1));

		uv = yuv2rgb[(up[0] << 8) | vp[0]];
		e2 = yp[0];
		PONEPIX((e1 + e2) >> 1, xip[0])
		PONERGB(xip2[0], sum);
		PONEPIX(e2, xip[3])
		PONERGB(xip2[3], sum);
		e1 = yp[1];
		PONEPIX((e1 + e2) >> 1, xip[6])
		PONERGB(xip2[6], sum);
		PONEPIX(e1, xip[9])
		PONERGB(xip2[9], sum);

		o2 = yp2[0];
		PONEPIX((o1 + o2) >> 1, xip3[0])
		PONERGB(xip4[0], sum);
		PONEPIX(o2, xip3[3])
		PONERGB(xip4[3], sum);
		o1 = yp2[1];
		PONEPIX((o1 + o2) >> 1, xip3[6])
		PONERGB(xip4[6], sum);
		PONEPIX(o1, xip3[9])
		PONERGB(xip4[9], sum);


		xip += 12;
		yp += 2;
		up += 1;
		vp += 1;

		w -= 2;
		if (w <= 0) {
			w = width;
			register u_int pstride = 2 * iw - w;
			register u_int cstride = (iw - w) >> 1;
			yp += pstride;
			e1 = yp[0];
			o1 = yp[iw];
			up += cstride;
			vp += cstride;
			xip += 3 * (8 * iw - 2 * w);
		}
	}
}

void TrueWindowRenderer24::map_gray(register const u_char *yp,
				  u_int off, u_int /* x */,
				  u_int width, u_int height) const
{
	register u_int iw = width_;
	yp += off;
	register char* xip = (char*)pixbuf_ + 3*off;
	register int w = width;
	for (register int len = w * height; len > 0; len -= 8) {
		register u_int y1;
		register u_int pix;

		y1 = *(const u_int*)yp;
		pix = (y1 >> SHIFT_24) & 0xff;
		PONEGRAY(xip[0], pix);
		pix = (y1 >> SHIFT_16) & 0xff;
		PONEGRAY(xip[3], pix);
		pix = (y1 >> SHIFT_8) & 0xff;
		PONEGRAY(xip[6], pix);
		pix = (y1 >> SHIFT_0) & 0xff;
		PONEGRAY(xip[9], pix);

		y1 = *(const u_int*)(yp + 4);
		pix = (y1 >> SHIFT_24) & 0xff;
		PONEGRAY(xip[12], pix);
		pix = (y1 >> SHIFT_16) & 0xff;
		PONEGRAY(xip[15], pix);
	        pix = (y1 >> SHIFT_8) & 0xff;
		PONEGRAY(xip[18], pix);
		pix = (y1 >> SHIFT_0) & 0xff;
		PONEGRAY(xip[21], pix);

		xip += 24;
		yp += 8;

		w -= 8;
		if (w <= 0) {
			w = width;
			register u_int pstride = iw - w;
			yp += pstride;
			xip += 3*pstride;
		}
	}
}

void TrueWindowRenderer24::map_gray_down2(register const u_char *yp,
					u_int off, u_int x,
					u_int width, u_int height) const
{
	register u_int iw = width_;
	yp += off;
	off = ((off - x) >> 2) + (x >> 1);
	register char* xip = (char*)pixbuf_ + 3*off;
	register int w = width;
	for (register int len = w * height >> 1; len > 0; len -= 8) {
		register u_int y1;
		register u_int pix;

		y1 = *(const u_int*)yp;
		pix = (y1 >> SHIFT_24) & 0xff;
		PONEGRAY(xip[0], pix);
		pix = (y1 >> SHIFT_8) & 0xff;
		PONEGRAY(xip[3], pix);

		y1 = *(const u_int*)(yp + 4);
		pix = (y1 >> SHIFT_24) & 0xff;
		PONEGRAY(xip[6], pix);
		pix = (y1 >> SHIFT_8) & 0xff;
		PONEGRAY(xip[9], pix);

		xip += 12;
		yp += 8;

		w -= 8;
		if (w <= 0) {
			w = width;
			register int pstride = 2 * iw - w;
			yp += pstride;
			xip += 3*((iw - w) >> 1);
		}
	}
}

void TrueWindowRenderer24::map_gray_down4(register const u_char *yp,
					u_int off, u_int x,
					u_int width, u_int height) const
{
	register u_int iw = width_;
	yp += off;
	register char* xip = (char*)pixbuf_ + 3*(((off - x) >> 4) + (x >> 2));
	register int w = width;
	for (register int len = w * height >> 2; len > 0; len -= 8) {
		register u_int pix;

		pix = yp[0];
		PONEGRAY(xip[0], pix);
		pix = yp[4];
		PONEGRAY(xip[3], pix);

		xip += 6;
		yp += 8;

		w -= 8;
		if (w <= 0) {
			w = width;
			register int pstride = 4 * iw - w;
			yp += pstride;
			xip += 3*((iw - w) >> 2);
		}
	}
}

void TrueWindowRenderer24::map_gray_down(register const u_char *yp,
				       u_int off, u_int x,
				       u_int width, u_int height) const
{
	register u_int iw = width_;
	yp += off;
	register int s = scale_;
	register int istride = 1 << s;
	register char* xip = (char*)pixbuf_ +
		3*(((off - x) >> (s + s)) + (x >> s));
	register int w = width;
	for (register int len = w * height >> s; len > 0; len -= istride) {
		register u_int pix = *yp;
		PONEGRAY(xip[0], pix);
		xip += 3;
		yp += istride;
		w -= istride;
		if (w <= 0) {
			w = width;
			register int pstride = (iw << s) - w;
			yp += pstride;
			xip += 3*((iw - w) >> s);
		}
	}
}

void TrueWindowRenderer24::map_gray_up2(register const u_char *yp,
				      u_int off, u_int x,
				      u_int width, u_int height) const
{
	register u_int iw = width_;
	yp += off;
	register char* xip = (char*)pixbuf_ + 3*(((off - x) << 2) + (x << 1));
	register int w = width;
	register u_int e1 = yp[0];

	for (register int len = width * height; len > 0; len -= 8) {
		register u_int y1, e2, pix;
		register char* xip2 = xip + 3*iw * 2;

		y1 = *(const u_int*)yp;
		e2 = (y1 >> SHIFT_24) & 0xff;
		pix = (e1 + e2) >> 1;
		PONEGRAY(xip[0], pix);
		PONEGRAY(xip2[0], pix);
		PONEGRAY(xip[3], e2);
		PONEGRAY(xip2[3], e2);
		e1 = (y1 >> SHIFT_16) & 0xff;
		pix = (e1 + e2) >> 1;
		PONEGRAY(xip[6], pix);
		PONEGRAY(xip2[6], pix);
		PONEGRAY(xip[9], e1);
		PONEGRAY(xip2[9], e1);

		e2 = (y1 >> SHIFT_8) & 0xff;
		pix = (e1 + e2) >> 1;
		PONEGRAY(xip[12], pix);
		PONEGRAY(xip2[12], pix);
		PONEGRAY(xip[15], e2);
		PONEGRAY(xip2[15], e2);
		e1 = (y1 >> SHIFT_0) & 0xff;
		pix = (e1 + e2) >> 1;
		PONEGRAY(xip[18], pix);
		PONEGRAY(xip2[18], pix);
		PONEGRAY(xip[21], e1);
		PONEGRAY(xip2[21], e1);

		y1 = *(const u_int*)(yp + 4);
		e2 = (y1 >> SHIFT_24) & 0xff;
		pix = (e1 + e2) >> 1;
		PONEGRAY(xip[24], pix);
		PONEGRAY(xip2[24], pix);
		PONEGRAY(xip[27], e2);
		PONEGRAY(xip2[27], e2);
		e1 = (y1 >> SHIFT_16) & 0xff;
		pix = (e1 + e2) >> 1;
		PONEGRAY(xip[30], pix);
		PONEGRAY(xip2[30], pix);
		PONEGRAY(xip[33], e1);
		PONEGRAY(xip2[33], e1);

		e2 = (y1 >> SHIFT_8) & 0xff;
		pix = (e1 + e2) >> 1;
		PONEGRAY(xip[36], pix);
		PONEGRAY(xip2[36], pix);
		PONEGRAY(xip[39], e2);
		PONEGRAY(xip2[39], e2);
		e1 = (y1 >> SHIFT_0) & 0xff;
		pix = (e1 + e2) >> 1;
		PONEGRAY(xip[42], pix);
		PONEGRAY(xip2[42], pix);
		PONEGRAY(xip[45], e1);
		PONEGRAY(xip2[45], e1);

		xip += 48;
		yp += 8;

		w -= 8;
		if (w <= 0) {
			w = width;
			register u_int pstride = iw - w;
			yp += pstride;
			e1 = yp[0];
			xip += 3*((iw + pstride) << 1);
		}
	}
}

/* The following 4 functions contain assembly language implementations of the
 * yuv->rgb conversion functions.  If you suspect bugs in these functions,
 * e-mail amahesri@uclink4.berkeley.edu.
 */
#if MMX_CSCONV_ENABLED
/*
 * TrueWindowRenderer32::map_422_asm --
 *
 *      MMX inline-assembly YUV -> RGB loop for the 32-bit renderer.
 *
 *      Each pass of the loop loads 8 luma bytes through yp (%esi) and
 *      4 bytes each through up (%ebx) and vp (%ecx), so one chroma pair
 *      is shared by two horizontally adjacent pixels, and issues eight
 *      4-byte stores through xip (%edi).  The pointers then advance by
 *      8, 4, 4 and 32 bytes respectively.  (The comments embedded in the
 *      asm text label %ebx as "Cr" and %ecx as "Cb".)
 *
 *      The asm text refers to the following symbols by name instead of
 *      taking them as operands, so they have to be defined elsewhere
 *      with exactly these assembler-visible names:
 *          const128, const05, const1, const15, const2, const45, const5,
 *          const55, const6, davemask, empty      (only read here)
 *          segwidth, picsize                     (counters, modified here)
 *          segwidthconst, pstride, cstride, pstride2  (read at end of row)
 *      Presumably the caller initialises them before the call -- confirm
 *      against the call site.
 *
 *      segwidth drops by 8 per pass; when it reaches zero the row-end
 *      code adds pstride to yp, cstride to up and vp, pstride2 to xip and
 *      reloads segwidth from segwidthconst.  picsize also drops by 8 per
 *      pass and the loop ends only when it is exactly zero; the body is
 *      executed once before the first test.
 *
 *      NOTE(review): the labels are plain (non-local) assembler labels in
 *      an inline function, so they would collide if the body were ever
 *      emitted more than once in a translation unit.  %mm0-%mm7 are
 *      modified without appearing in the clobber list.
 *
 *      NOTE(review): the two instructions that target %mm5 just before
 *      the final stores produce a value that is overwritten on the next
 *      pass without being read; they look like leftovers from the 411
 *      variant.
 */
inline void TrueWindowRenderer32::map_422_asm(const u_char* yp, const u_char* up,
					      const u_char* vp, RGBPointer xip) const
{

	__asm__ __volatile__(
"# Y pointer -> %%esi\n"
"# Cr pointer -> %%ebx\n"
"# Cb pointer -> %%ecx\n"
"# outptr -> %%edi\n"

/* Loop head: one pass converts 8 luma bytes into eight 32-bit stores. */
"do_next8_1:\n"
    "movd (%%ebx),%%mm0 # 0 0 0 0 Cr3 Cr2 Cr1 Cr0\n"
    "pxor %%mm6,%%mm6\n"
    "punpcklbw %%mm0,%%mm0 # Cr3 Cr3 Cr2 Cr2 Cr1 Cr1 Cr0 Cr0\n"
    "movq const128,%%mm7\n"
    "punpcklwd %%mm0,%%mm0 # Cr1 Cr1 Cr1 Cr1 Cr0 Cr0 Cr0 Cr0\n"
    "movq %%mm0,%%mm4\n"
    "punpcklbw %%mm6,%%mm0 # Cr0 Cr0 Cr0 Cr0\n"
    "psubsw %%mm7,%%mm0 # Cr0 - 128:Cr0-128:Cr0-128:Cr0 -128\n"
    "movd (%%ecx),%%mm1 # 0 0 0 0 Cb3 Cb2 Cb1 Cb0\n"
    "psllw $2,%%mm0 # left shift by 2 bits\n"
    "punpcklbw %%mm1,%%mm1 # Cb3 Cb3 Cb2 Cb2 Cb1 Cb1 Cb0 Cb0\n"
    "paddsw const05,%%mm0 # add (one_half/fix(x)) << 2\n"
    "punpcklwd %%mm1,%%mm1 # Cb1 Cb1 Cb1 Cb1 Cb0 Cb0 Cb0 Cb0\n"
    "movq %%mm1,%%mm5\n"
    "pmulhw const1,%%mm0 # multiply by (fix(x) >> 1)\n"
    "punpcklbw %%mm6,%%mm1 # Cb0 Cb0 Cb0 Cb0\n"
    "punpckhbw %%mm6,%%mm4 # Cr1 Cr1 Cr1 Cr1\n"
    "psubsw %%mm7,%%mm1 # Cb0 - 128:Cb0-128:Cb0-128:Cb0 -128\n"
    "punpckhbw %%mm6,%%mm5 # Cb1 Cb1 Cb1 Cb1\n"
    "psllw $2,%%mm1 # left shift by 2 bits\n"
    "paddsw const15,%%mm1 # add (one_half/fix(x)) << 2\n"
    "psubsw %%mm7,%%mm4 # Cr1 - 128:Cr1-128:Cr1-128:Cr1 -128\n"
    "psubsw %%mm7,%%mm5 # Cb1 - 128:Cb1-128:Cb1-128:Cb1 -128\n"
    "pmulhw const2,%%mm1 # multiply by (fix(x) >> 1)\n"
    "psllw $2,%%mm4 # left shift by 2 bits\n"
    "psllw $2,%%mm5 # left shift by 2 bits\n"
    "paddsw const45,%%mm4 # add (one_half/fix(x)) << 2\n"
    "movd (%%esi),%%mm7 # Y13 Y12 Y9 Y8 Y5 Y4 Y1 Y0\n"
    "pmulhw const5,%%mm4 # multiply by (fix(x) >> 1)\n"
    "movq %%mm7,%%mm6\n"
    "punpcklbw %%mm7,%%mm7 # Y5 Y5 Y4 Y4 Y1 Y1 Y0 Y0\n"
    "paddsw const55,%%mm5 # add (one_half/fix(x)) << 2\n"
    "paddsw %%mm1,%%mm0 # cred0 cbl0 cgr0 cred0\n"
    "movq %%mm7,%%mm1\n"
    "pmulhw const6,%%mm5 # multiply by (fix(x) >> 1)\n"
    "movq %%mm0,%%mm2 # cred0 cbl0 cgr0 cred0\n"
    "punpcklwd %%mm6,%%mm7 # Y5 Y4 Y1 Y1 Y1 Y0 Y0 Y0\n"
    "pand davemask,%%mm2 # 0 cbl0 cgr0 0\n"
    "psrlq $16,%%mm1 # 0 0 Y5 Y5 Y4 Y4 Y1 Y1\n"
    "psrlq $16,%%mm2 # 0 0 cbl0 cgr0\n"
    "punpcklbw empty,%%mm7 # Y1 Y0 Y0 Y0\n"
    "paddsw %%mm5,%%mm4 # cbl1 cgr1 cred1 cbl1\n"
    "movq %%mm4,%%mm3 # cbl1 cgr1 cred1 cbl1\n"
    "pand davemask,%%mm3 # 0 cgr1 cred1 0\n"
    "paddsw %%mm0,%%mm7 # r1 b0 g0 r0\n"
    "psllq $16,%%mm3 # cgr1 cred1 0 0\n"
    "movq %%mm1,%%mm6 # 0 0 Y5 Y5 Y4 Y4 Y1 Y1\n"
    "por %%mm3,%%mm2 # cgr1 cred1 cbl0 cgr0\n"
    "punpcklbw empty,%%mm6 # Y4 Y4 Y1 Y1\n"
    "paddsw %%mm2,%%mm6 # g4 r4 b1 g1\n"
    "packuswb %%mm6,%%mm7 # g4 r4 b1 g1 r1 b0 g0 r0\n"
    "movd %%mm7,%%eax # r1 b0 g0 r0\n"
    "andl $0xffffff,%%eax # 0 b0 g0 r0\n"
    "movl %%eax,(%%edi) # move to memory b0 g0 r0\n"
    "psrlq $24,%%mm7 # 0 0 0 g4 r4 b1 g1 r1\n"
    "movd %%mm7,%%eax # r4 b1 g1 r1\n"
    "andl $0xffffff,%%eax # 0 b1 g1 r1\n"
    "movl %%eax,4(%%edi) # move to memory b1 g1 r1\n"
    "psrlq $24,%%mm7 # 0 0 0 0 0 0 g4 r4\n"
    "movd %%mm7,%%edx # 0 0 g4 r4\n"
"# move to memory g4 r4 b1 g1 r1 b0 g0 r0\n"
/* Second half of the pass: chroma bytes 2 and 3, luma bytes 4..7. */
    "movq %%mm1,%%mm0 # 0 0 Y5 Y5 Y4 Y4 Y1 Y1\n"
    "psrlq $24,%%mm1 # 0 0 0 0 0 Y5 Y5 Y4\n"
    "psrlq $32,%%mm0 # 0 0 0 0 0 0 Y5 Y5\n"
    "punpcklwd %%mm0,%%mm1 # X X X X Y5 Y5 Y5 Y4\n"
    "movd (%%ebx),%%mm0 # 0 0 0 0 Cr5 Cr4 Cr3 Cr2\n"
    "psrlq $16,%%mm0\n"
    "punpcklbw empty,%%mm1 # Y5 Y5 Y5 Y4\n"
    "paddsw %%mm4,%%mm1 # b5 g5 r5 b4\n"
    "pxor %%mm6,%%mm6 # clear mm6 registr\n"
    "punpcklbw %%mm0,%%mm0 # X X X X Cr3 Cr3 Cr2 Cr2\n"
    "punpcklwd %%mm0,%%mm0 # Cr3 Cr3 Cr3 Cr3 Cr2 Cr2 Cr2 Cr2\n"
    "movq %%mm0,%%mm4\n"
    "movd (%%ecx),%%mm3 # 0 0 0 0 Cb5 Cb4 Cb3 Cb2\n"
    "punpcklbw %%mm6,%%mm0 # Cr2 Cr2 Cr2 Cr2\n"
    "psrlq $16,%%mm3\n"
    "psubsw const128,%%mm0 # Cr2 - 128:Cr2-128:Cr2-128:Cr2 -128\n"
    "punpcklbw %%mm3,%%mm3 # X X X X Cb3 Cb3 Cb2 Cb2\n"
    "psllw $2,%%mm0 # left shift by 2 bits\n"
    "paddsw const05,%%mm0 # add (one_half/fix(x)) << 2\n"
    "punpcklwd %%mm3,%%mm3 # Cb3 Cb3 Cb3 Cb3 Cb2 Cb2 Cb2 Cb2\n"
    "movq %%mm3,%%mm7\n"
    "pmulhw const1,%%mm0 # multiply by (fix(x) >> 1)\n"
    "punpcklbw %%mm6,%%mm3 # Cb2 Cb2 Cb2 Cb2\n"
    "psubsw const128,%%mm3 # Cb0 - 128:Cb0-128:Cb0-128:Cb0 -128\n"
    "punpckhbw %%mm6,%%mm4 # Cr3 Cr3 Cr3 Cr3\n"
    "psllw $2,%%mm3 # left shift by 2 bits\n"
    "paddsw const15,%%mm3 # add (one_half/fix(x)) << 2\n"
    "punpckhbw %%mm6,%%mm7 # Cb3 Cb3 Cb3 Cb3\n"
    "pmulhw const2,%%mm3 # multiply by (fix(x) >> 1)\n"
    "psubsw const128,%%mm7 # Cb3 - 128:Cb3-128:Cb3-128:Cb3 -128\n"
    "paddsw %%mm3,%%mm0 # cred2 cbl2 cgr2 cred2\n"
    "psllw $2,%%mm7 # left shift by 2 bits\n"
    "psubsw const128,%%mm4 # Cr3 - 128:Cr3-128:Cr3-128:Cr3 -128\n"
    "movd 4(%%esi),%%mm3 # Y21 Y20 Y17 Y16 Y13 Y12 Y9 Y8\n"
    "psllw $2,%%mm4 # left shift by 2 bits\n"
    "paddsw const55,%%mm7 # add (one_half/fix(x)) << 2\n"
    "movq %%mm3,%%mm6 # Y21 Y20 Y17 Y16 Y13 Y12 Y9 Y8\n"
    "movq %%mm0,%%mm2\n"
    "pand davemask,%%mm2\n"
    "punpcklbw %%mm3,%%mm3 # Y13 Y13 Y12 Y12 Y9 Y9 Y8 Y8\n"
    "psrlq $16,%%mm2\n"
    "paddsw const45,%%mm4 # add (one_half/fix(x)) << 2\n"
    "punpcklwd %%mm6,%%mm3 # X X X X Y9 Y8 Y8 Y8\n"
    "pmulhw const5,%%mm4 # multiply by (fix(x) >> 1)\n"
    "pmulhw const6,%%mm7 # multiply by (fix(x) >> 1)\n"
    "punpcklbw empty,%%mm3 # Y9 Y8 Y8 Y8\n"
    "paddsw %%mm7,%%mm4 # cbl3 cgr3 cred3 cbl3\n"
    "paddsw %%mm0,%%mm3 # r9 b8 g8 r8\n"
    "movq %%mm4,%%mm7\n"
    "packuswb %%mm3,%%mm1 # r9 b8 g8 r8 b5 g5 r5 b4\n"
    "pand davemask,%%mm7\n"
    "psrlq $8,%%mm6 # 0 Y21 Y20 Y17 Y16 Y13 Y12 Y9\n"
    "psllq $16,%%mm7\n"
    "movd %%mm1,%%eax # b5 g5 r5 b4\n"
    "andl $0xff,%%eax # b4\n"
    "shll $16,%%eax # 0 b4 0 0\n"
    "orl %%eax,%%edx # 0 b4 g4 r4\n"
    "movl %%edx,8(%%edi) # move to memory b4 g4 r4\n"
    "movd %%mm1,%%eax # b5 g5 r5 b4\n"
    "shrl $8,%%eax # 0 b5 g5 r5\n"
    "movl %%eax,12(%%edi) # move to memory b5 g5 r5\n"
    "psrlq $32,%%mm1 # 0 0 0 0 r9 b8 g8 r8\n"
    "movd %%mm1,%%eax # r9 b8 g8 r8\n"
    "andl $0xffffff,%%eax # 0 b8 g8 r8\n"
    "movl %%eax,16(%%edi) # move to memory b8 g8 r8\n"
    "movd %%mm1,%%edx # r9 b8 g8 r8\n"
    "shrl $24,%%edx # 0 0 0 r9\n"
"# move to memory r9 b8 g8 r8 b5 g5 r5 b4\n"
    "por %%mm7,%%mm2\n"
    "pxor %%mm1,%%mm1\n"
    "movq %%mm6,%%mm3 # 0 Y21 Y20 Y17 Y16 Y13 Y12 Y9\n"
    "punpcklbw %%mm6,%%mm6 # X X X X Y12 Y12 Y9 Y9\n"
    "punpcklbw %%mm1,%%mm6 # Y62212 Y12 Y9 Y9\n"
    "paddsw %%mm2,%%mm6 # g12 r12 b9 g9\n"
    "psrlq $8,%%mm3 # 0 0 Y21 Y20 Y17 Y16 Y13 Y12\n"
    "movq %%mm3,%%mm1 # 0 0 Y21 Y20 Y17 Y16 Y13 Y12\n"
    "punpcklbw %%mm3,%%mm3 # X X X X Y13 Y13 Y12 Y12\n"
    "addl $8,%%esi\n"
    "psrlq $16,%%mm3 # X X X X X X Y13 Y13 modified on 09/24\n"
    "punpcklwd %%mm3,%%mm1 # X X X X Y13 Y13 Y13 Y12\n"
    "punpcklbw empty,%%mm1 # Y13 Y13 Y13 Y12\n"
    "paddsw %%mm4,%%mm1 # b13 g13 r13 b12\n"
    "addl $32,%%edi\n"
    "punpcklwd %%mm0,%%mm5 # X X X X Y15 Y15 Y15 Y14\n"
    "packuswb %%mm1,%%mm6 # b13 g13 r13 b12 g12 r12 b9 g9\n"
    "punpcklbw empty,%%mm5 # Y15 Y15 Y15 Y14\n"
    "addl $4,%%ebx\n"
    "movd %%mm6,%%eax # g12 r12 b9 g9\n"
    "andl $0xffff,%%eax # 0 0 b9 g9\n"
    "shll $8,%%eax # 0 b9 g9 0\n"
    "orl %%edx,%%eax # 0 b9 g9 r9\n"
    "movl %%eax,-12(%%edi) # move to memory b9 g9 r9\n"
    "psrlq $16,%%mm6 # 0 0 b13 g13 r13 b12 g12 r12\n"
    "movd %%mm6,%%eax # r13 b12 g12 r12\n"
    "andl $0xffffff,%%eax # 0 b12 g12 r12\n"
    "movl %%eax,-8(%%edi) # move to memory b12 g12 r12\n"
    "psrlq $24,%%mm6 # 0 0 0 0 0 b13 g13 r13\n"
    "movd %%mm6,%%eax # 0 b13 g13 r13\n"
    "movl %%eax,-4(%%edi) # move to memeory b13 g13 r13\n"
"# move to memory b13 g13 r13 b12 g12 r12 b9 g9\n"
    "addl $4,%%ecx\n"
/* Row and loop bookkeeping; the counters live in memory symbols. */
    "addl $-8,segwidth\n"
    "jz end_of_line8_1\n"
"ltest8_1:\n"
    "addl $-8,picsize # cols_asm should be replaced by a register that has the num of cols\n"
    "testl $0xffffffff,picsize # sets flag to 0 if edx is 0\n"
    "jnz do_next8_1 # if cols_asm is not zero, loop\n"
    "jmp end8_1\n"
"end_of_line8_1:\n"
    "addl pstride,%%esi # yp += pstride\n"
    "addl cstride,%%ebx # up += cstride\n"
    "addl cstride,%%ecx # vp += cstride\n"
    "addl pstride2,%%edi # xip += pstride\n"
    "movl segwidthconst,%%eax\n"
    "movl %%eax,segwidth\n"
    "jmp ltest8_1\n"
"end8_1:\n"
    "emms\n"
	  /*
	   * The four pointers are pinned to the registers the asm text names
	   * and are also listed as outputs because the text advances them.
	   * %eax and %edx are used as scratch.
	   */
	  : "=S" (yp), "=b" (up), "=c" (vp), "=D" (xip)
	  : "S" (yp), "b" (up), "c" (vp), "D" (xip)
	  : "memory", "%edx", "%eax"
	  );

}

/*
 * TrueWindowRenderer32::map_411_asm --
 *
 *      MMX inline-assembly YUV -> RGB loop for the 32-bit renderer that
 *      converts two rows at a time.
 *
 *      Each pass of the loop loads 8 luma bytes from each of two rows
 *      (yp in %esi, and yp2 = yp + iw in %eax) plus 4 bytes each through
 *      up (%ebx) and vp (%ecx), so one chroma pair is shared by a 2x2
 *      group of pixels.  It issues eight 4-byte stores to each of two
 *      output rows (xip in %edi, and xip2 = xip + iw in %edx).  The
 *      pointers then advance by 8 (luma), 4 (chroma) and 32 (output)
 *      bytes.  (The comments embedded in the asm text label %ebx as "Cr"
 *      and %ecx as "Cb".)
 *
 *      iw is the distance, in elements, between the two rows for both
 *      the luma input and the output.
 *
 *      The asm text refers to the following symbols by name instead of
 *      taking them as operands, so they have to be defined elsewhere
 *      with exactly these assembler-visible names:
 *          const128, const05, const1, const15, const2, const45, const5,
 *          const55, const6, davemask, empty      (only read here)
 *          segwidth, picsize                     (counters, modified here)
 *          segwidthconst, pstride, cstride, pstride2  (read at end of row)
 *      Presumably the caller initialises them before the call -- confirm
 *      against the call site.
 *
 *      segwidth drops by 8 and picsize by 16 per pass.  When segwidth
 *      reaches zero the row-end code adds pstride to both luma pointers,
 *      cstride to up and vp, pstride2 to both output pointers and
 *      reloads segwidth from segwidthconst (through %mm7, since every
 *      general register used here holds a pointer).  The loop ends only
 *      when picsize is exactly zero; the body is executed once before
 *      the first test.
 *
 *      All six general registers named in the text are advanced by it,
 *      so they are declared as outputs as well as inputs, in the same
 *      form the map_422_asm variants use.  Declaring them input-only
 *      would let the compiler assume they still hold their original
 *      values after the asm statement.
 *
 *      NOTE(review): the labels are plain (non-local) assembler labels in
 *      an inline function, so they would collide if the body were ever
 *      emitted more than once in a translation unit.  %mm0-%mm7 are
 *      modified without appearing in the clobber list.
 */
inline void TrueWindowRenderer32::map_411_asm(const u_char* yp, const u_char* up,
					      const u_char* vp, RGBPointer xip,
					      u_int iw) const {
  RGBPointer xip2 = xip + iw;
  const u_char* yp2 = yp + iw;

	__asm__ __volatile__(
"# Y pointer 1 -> %%esi\n"
"# Y pointer 2 -> %%eax\n"
"# Cr pointer -> %%ebx\n"
"# Cb pointer -> %%ecx\n"
"# outptr 1 -> %%edi\n"
"# outptr 2 -> %%edx\n"

/* Loop head: one pass converts 8 luma bytes from each of the two rows. */
"do_next16_1:\n"
    "movd (%%ebx),%%mm0 # 0 0 0 0 Cr3 Cr2 Cr1 Cr0\n"
    "pxor %%mm6,%%mm6\n"
    "punpcklbw %%mm0,%%mm0 # Cr3 Cr3 Cr2 Cr2 Cr1 Cr1 Cr0 Cr0\n"
    "movq const128,%%mm7\n"
    "punpcklwd %%mm0,%%mm0 # Cr1 Cr1 Cr1 Cr1 Cr0 Cr0 Cr0 Cr0\n"
    "movq %%mm0,%%mm4\n"
    "punpcklbw %%mm6,%%mm0 # Cr0 Cr0 Cr0 Cr0\n"
    "psubsw %%mm7,%%mm0 # Cr0 - 128:Cr0-128:Cr0-128:Cr0 -128\n"
    "movd (%%ecx),%%mm1 # 0 0 0 0 Cb3 Cb2 Cb1 Cb0\n"
    "psllw $2,%%mm0 # left shift by 2 bits\n"
    "punpcklbw %%mm1,%%mm1 # Cb3 Cb3 Cb2 Cb2 Cb1 Cb1 Cb0 Cb0\n"
    "paddsw const05,%%mm0 # add (one_half/fix(x)) << 2\n"
    "punpcklwd %%mm1,%%mm1 # Cb1 Cb1 Cb1 Cb1 Cb0 Cb0 Cb0 Cb0\n"
    "movq %%mm1,%%mm5\n"
    "pmulhw const1,%%mm0 # multiply by (fix(x) >> 1)\n"
    "punpcklbw %%mm6,%%mm1 # Cb0 Cb0 Cb0 Cb0\n"
    "punpckhbw %%mm6,%%mm4 # Cr1 Cr1 Cr1 Cr1\n"
    "psubsw %%mm7,%%mm1 # Cb0 - 128:Cb0-128:Cb0-128:Cb0 -128\n"
    "punpckhbw %%mm6,%%mm5 # Cb1 Cb1 Cb1 Cb1\n"
    "psllw $2,%%mm1 # left shift by 2 bits\n"
    "paddsw const15,%%mm1 # add (one_half/fix(x)) << 2\n"
    "psubsw %%mm7,%%mm4 # Cr1 - 128:Cr1-128:Cr1-128:Cr1 -128\n"
    "psubsw %%mm7,%%mm5 # Cb1 - 128:Cb1-128:Cb1-128:Cb1 -128\n"
    "pmulhw const2,%%mm1 # multiply by (fix(x) >> 1)\n"
    "psllw $2,%%mm4 # left shift by 2 bits\n"
    "psllw $2,%%mm5 # left shift by 2 bits\n"
    "paddsw const45,%%mm4 # add (one_half/fix(x)) << 2\n"
    "movd (%%esi),%%mm7 # Y13 Y12 Y9 Y8 Y5 Y4 Y1 Y0\n"
    "pmulhw const5,%%mm4 # multiply by (fix(x) >> 1)\n"
    "movq %%mm7,%%mm6\n"
    "punpcklbw %%mm7,%%mm7 # Y5 Y5 Y4 Y4 Y1 Y1 Y0 Y0\n"
    "paddsw const55,%%mm5 # add (one_half/fix(x)) << 2\n"
    "paddsw %%mm1,%%mm0 # cred0 cbl0 cgr0 cred0\n"
    "movq %%mm7,%%mm1\n"
    "pmulhw const6,%%mm5 # multiply by (fix(x) >> 1)\n"
    "movq %%mm0,%%mm2 # cred0 cbl0 cgr0 cred0\n"
    "punpcklwd %%mm6,%%mm7 # Y5 Y4 Y1 Y1 Y1 Y0 Y0 Y0\n"
    "pand davemask,%%mm2 # 0 cbl0 cgr0 0\n"
    "psrlq $16,%%mm1 # 0 0 Y5 Y5 Y4 Y4 Y1 Y1\n"
    "psrlq $16,%%mm2 # 0 0 cbl0 cgr0\n"
    "punpcklbw empty,%%mm7 # Y1 Y0 Y0 Y0\n"
    "paddsw %%mm5,%%mm4 # cbl1 cgr1 cred1 cbl1\n"
    "movq %%mm4,%%mm3 # cbl1 cgr1 cred1 cbl1\n"
    "pand davemask,%%mm3 # 0 cgr1 cred1 0\n"
    "paddsw %%mm0,%%mm7 # r1 b0 g0 r0\n"
    "psllq $16,%%mm3 # cgr1 cred1 0 0\n"
    "movq %%mm1,%%mm6 # 0 0 Y5 Y5 Y4 Y4 Y1 Y1\n"
    "por %%mm3,%%mm2 # cgr1 cred1 cbl0 cgr0\n"
    "punpcklbw empty,%%mm6 # Y4 Y4 Y1 Y1\n"
    "movd (%%eax),%%mm3 # Y15 Y14 Y11 Y10 Y7 Y6 Y3 Y2\n"
    "paddsw %%mm2,%%mm6 # g4 r4 b1 g1\n"
    "packuswb %%mm6,%%mm7 # g4 r4 b1 g1 r1 b0 g0 r0\n"
    "movq %%mm3,%%mm6 # Y15 Y14 Y11 Y10 Y7 Y6 Y3 Y2\n"
    "punpcklbw %%mm3,%%mm3 # Y7 Y7 Y6 Y6 Y3 Y3 Y2 Y2\n"
"# move to memory g4 r4 b1 g1 r1 b0 g0 r0\n"
    "movd %%mm7,(%%edi) # move to memory b0 g0 r0\n"
    "psrlq $24,%%mm7 # 0 0 0 g4 r4 b1 g1 r1 b0 g0 r0\n"
    "movd %%mm7,4(%%edi) # move to memory b0 g0 r0\n"
    "psrlq $24,%%mm7 # 0 0 0 0 0 0 g4 r4\n"
    "movd %%mm7,8(%%edi) # move to memory g4 r4\n"
    "movq %%mm3,%%mm5 # Y7 Y7 Y6 Y6 Y3 Y3 Y2 Y2\n"
    "punpcklwd %%mm6,%%mm3 # X X X X Y3 Y2 Y2 Y2\n"
    "punpcklbw empty,%%mm3 # Y3 Y2 Y2 Y2\n"
    "psrlq $16,%%mm5 # 0 0 Y7 Y7 Y6 Y6 Y3 Y3\n"
    "paddsw %%mm0,%%mm3 # r3 b2 g2 r2\n"
    "movq %%mm5,%%mm6 # 0 0 Y7 Y7 Y6 Y6 Y3 Y3\n"
    "movq %%mm1,%%mm0 # 0 0 Y5 Y5 Y4 Y4 Y1 Y1\n"
    "punpckldq %%mm6,%%mm6 # X X X X Y6 Y6 Y3 Y3\n"
    "punpcklbw empty,%%mm6 # Y6 Y6 Y3 Y3\n"
    "psrlq $24,%%mm1 # 0 0 0 0 0 Y5 Y5 Y4\n"
    "paddsw %%mm2,%%mm6 # g6 r6 b3 g3\n"
    "packuswb %%mm6,%%mm3 # g6 r6 b3 g3 r3 b2 g2 r2\n"
    "movq %%mm5,%%mm2 # 0 0 Y7 Y7 Y6 Y6 Y3 Y3\n"
    "psrlq $32,%%mm0 # 0 0 0 0 0 0 Y5 Y5\n"
"# move to memory g6 r6 b3 g3 r3 b2 g2 r2\n"
    "movd %%mm3,(%%edx) # move to memory b2 g2 r2\n"
    "psrlq $24,%%mm3 # 0 0 0 g6 r6 b3 g3 r3\n"
    "movd %%mm3,4(%%edx) # move to memory b3 g3 r3\n"
    "psrlq $24,%%mm3 # 0 0 0 0 0 0 g6 r6\n"
    "movd %%mm3,8(%%edx) # move to memory g6 r6\n"
    "punpcklwd %%mm0,%%mm1 # X X X X Y5 Y5 Y5 Y4\n"
    "psrlq $24,%%mm5 # 0 0 0 0 0 Y7 Y7 Y6\n"
    "movd (%%ebx),%%mm0 # 0 0 0 0 Cr5 Cr4 Cr3 Cr2\n"
    "psrlq $32,%%mm2 # 0 0 0 0 0 0 Y7 Y7\n"
    "psrlq $16,%%mm0\n"
    "punpcklbw empty,%%mm1 # Y5 Y5 Y5 Y4\n"
    "punpcklwd %%mm2,%%mm5 # X X X X Y7 Y7 Y7 Y6\n"
    "paddsw %%mm4,%%mm1 # b5 g5 r5 b4\n"
    "punpcklbw empty,%%mm5 # Y7 Y7 Y7 Y6\n"
    "pxor %%mm6,%%mm6 # clear mm6 registr\n"
    "punpcklbw %%mm0,%%mm0 # X X X X Cr3 Cr3 Cr2 Cr2\n"
    "paddsw %%mm4,%%mm5 # b7 g7 r7 b6\n"
    "punpcklwd %%mm0,%%mm0 # Cr3 Cr3 Cr3 Cr3 Cr2 Cr2 Cr2 Cr2\n"
    "movq %%mm0,%%mm4\n"
    "movd (%%ecx),%%mm3 # 0 0 0 0 Cb5 Cb4 Cb3 Cb2\n"
    "punpcklbw %%mm6,%%mm0 # Cr2 Cr2 Cr2 Cr2\n"
    "psrlq $16,%%mm3\n"
    "psubsw const128,%%mm0 # Cr2 - 128:Cr2-128:Cr2-128:Cr2 -128\n"
    "punpcklbw %%mm3,%%mm3 # X X X X Cb3 Cb3 Cb2 Cb2\n"
    "psllw $2,%%mm0 # left shift by 2 bits\n"
    "paddsw const05,%%mm0 # add (one_half/fix(x)) << 2\n"
    "punpcklwd %%mm3,%%mm3 # Cb3 Cb3 Cb3 Cb3 Cb2 Cb2 Cb2 Cb2\n"
    "movq %%mm3,%%mm7\n"
    "pmulhw const1,%%mm0 # multiply by (fix(x) >> 1)\n"
    "punpcklbw %%mm6,%%mm3 # Cb2 Cb2 Cb2 Cb2\n"
    "psubsw const128,%%mm3 # Cb0 - 128:Cb0-128:Cb0-128:Cb0 -128\n"
    "punpckhbw %%mm6,%%mm4 # Cr3 Cr3 Cr3 Cr3\n"
    "psllw $2,%%mm3 # left shift by 2 bits\n"
    "paddsw const15,%%mm3 # add (one_half/fix(x)) << 2\n"
    "punpckhbw %%mm6,%%mm7 # Cb3 Cb3 Cb3 Cb3\n"
    "pmulhw const2,%%mm3 # multiply by (fix(x) >> 1)\n"
    "psubsw const128,%%mm7 # Cb3 - 128:Cb3-128:Cb3-128:Cb3 -128\n"
    "paddsw %%mm3,%%mm0 # cred2 cbl2 cgr2 cred2\n"
    "psllw $2,%%mm7 # left shift by 2 bits\n"
    "psubsw const128,%%mm4 # Cr3 - 128:Cr3-128:Cr3-128:Cr3 -128\n"
    "movd 4(%%esi),%%mm3 # Y21 Y20 Y17 Y16 Y13 Y12 Y9 Y8\n"
    "psllw $2,%%mm4 # left shift by 2 bits\n"
    "paddsw const55,%%mm7 # add (one_half/fix(x)) << 2\n"
    "movq %%mm3,%%mm6 # Y21 Y20 Y17 Y16 Y13 Y12 Y9 Y8\n"
    "movq %%mm0,%%mm2\n"
    "pand davemask,%%mm2\n"
    "punpcklbw %%mm3,%%mm3 # Y13 Y13 Y12 Y12 Y9 Y9 Y8 Y8\n"
    "psrlq $16,%%mm2\n"
    "paddsw const45,%%mm4 # add (one_half/fix(x)) << 2\n"
    "punpcklwd %%mm6,%%mm3 # X X X X Y9 Y8 Y8 Y8\n"
    "pmulhw const5,%%mm4 # multiply by (fix(x) >> 1)\n"
    "pmulhw const6,%%mm7 # multiply by (fix(x) >> 1)\n"
    "punpcklbw empty,%%mm3 # Y9 Y8 Y8 Y8\n"
    "paddsw %%mm7,%%mm4 # cbl3 cgr3 cred3 cbl3\n"
    "paddsw %%mm0,%%mm3 # r9 b8 g8 r8\n"
    "movq %%mm4,%%mm7\n"
    "packuswb %%mm3,%%mm1 # r9 b8 g8 r8 b5 g5 r5 b4\n"
    "movd 4(%%eax),%%mm3 # Y23 Y22 Y19 Y18 Y15 Y14 Y11 Y10\n"
    "pand davemask,%%mm7\n"
    "psrlq $8,%%mm6 # 0 Y21 Y20 Y17 Y16 Y13 Y12 Y9\n"
    "psllq $16,%%mm7\n"
"# move to memory r9 b8 g8 r8 b5 g5 r5 b4\n"
/* %ecx is borrowed as scratch here and restored from the stack. */
    "pushl %%ecx\n"
    "movd %%mm1,%%ecx # put onto stack b5 g5 r5 b4\n"
    "shll $16,%%ecx # r5 b4 0 0\n"
    "orl %%ecx,8(%%edi) # or into memory b4\n"
    "popl %%ecx\n"
    "psrlq $8,%%mm1 # 0 r9 b8 g8 r8 b5 g5 r5\n"
    "movd %%mm1,12(%%edi) # move to memory b5 g5 r5\n"
    "psrlq $24,%%mm1 # 0 0 0 0 r9 b8 g8 r8\n"
    "movd %%mm1,16(%%edi) # move to memory b8 g8 r8\n"
    "psrlq $24,%%mm1 # 0 0 0 0 0 0 0 r9\n"
    "movd %%mm1,20(%%edi) # move to memory r9\n"
    "por %%mm7,%%mm2\n"
    "movq %%mm3,%%mm7 # Y23 Y22 Y19 Y18 Y15 Y14 Y11 Y10\n"
    "punpcklbw %%mm3,%%mm3 # X X X X Y11 Y11 Y10 Y10\n"
    "pxor %%mm1,%%mm1\n"
    "punpcklwd %%mm7,%%mm3 # X X X X Y11 Y10 Y10 Y10\n"
    "punpcklbw %%mm1,%%mm3 # Y11 Y10 Y10 Y10\n"
    "psrlq $8,%%mm7 # 0 Y23 Y22 Y19 Y18 Y15 Y14 Y11\n"
    "paddsw %%mm0,%%mm3 # r11 b10 g10 r10\n"
    "movq %%mm7,%%mm0 # 0 Y23 Y22 Y19 Y18 Y15 Y14 Y11\n"
    "packuswb %%mm3,%%mm5 # r11 b10 g10 r10 b7 g7 r7 b6\n"
    "punpcklbw %%mm7,%%mm7 # X X X X Y14 Y14 Y11 Y11\n"
"# move to memory r11 b10 g10 r10 b7 g7 r7 b6\n"
    "pushl %%ecx\n"
    "movd %%mm5,%%ecx # put onto stack b7 g7 r7 b6\n"
    "shll $16,%%ecx # r7 b6 0 0\n"
    "orl %%ecx,8(%%edx) # or into memeory b6\n"
    "popl %%ecx\n"
    "psrlq $8,%%mm5 # 0 r11 b10 g10 r10 b7 g7 r7\n"
    "movd %%mm5,12(%%edx) # move to memory b7 g7 r7\n"
    "psrlq $24,%%mm5 # 0 0 0 0 r11 b10 g10 r10\n"
    "movd %%mm5,16(%%edx) # move to memory b10 g10 r10\n"
    "psrlq $24,%%mm5 # 0 0 0 0 0 0 0 r11\n"
    "movd %%mm5,20(%%edx) # move to memory r11\n"
    "movq %%mm6,%%mm3 # 0 Y21 Y20 Y17 Y16 Y13 Y12 Y9\n"
    "punpcklbw %%mm6,%%mm6 # X X X X Y12 Y12 Y9 Y9\n"
    "punpcklbw %%mm1,%%mm7 # Y14 Y14 Y11 Y11\n"
    "punpcklbw %%mm1,%%mm6 # Y12 Y12 Y9 Y9\n"
    "paddsw %%mm2,%%mm7 # g14 r14 b11 g11\n"
    "paddsw %%mm2,%%mm6 # g12 r12 b9 g9\n"
    "psrlq $8,%%mm3 # 0 0 Y21 Y20 Y17 Y16 Y13 Y12\n"
    "movq %%mm3,%%mm1 # 0 0 Y21 Y20 Y17 Y16 Y13 Y12\n"
    "punpcklbw %%mm3,%%mm3 # X X X X Y13 Y13 Y12 Y12\n"
    "addl $8,%%esi\n"
    "psrlq $16,%%mm3 # X X X X X X Y13 Y13 modified on 09/24\n"
    "punpcklwd %%mm3,%%mm1 # X X X X Y13 Y13 Y13 Y12\n"
    "addl $8,%%eax\n"
    "psrlq $8,%%mm0 # 0 0 Y23 Y22 Y19 Y18 Y15 Y14\n"
    "punpcklbw empty,%%mm1 # Y13 Y13 Y13 Y12\n"
    "movq %%mm0,%%mm5 # 0 0 Y23 Y22 Y19 Y18 Y15 Y14\n"
    "punpcklbw %%mm0,%%mm0 # X X X X Y15 Y15 Y14 Y14\n"
    "paddsw %%mm4,%%mm1 # b13 g13 r13 b12\n"
    "psrlq $16,%%mm0 # X X X X X X Y15 Y15\n"
    "addl $32,%%edi\n"
    "punpcklwd %%mm0,%%mm5 # X X X X Y15 Y15 Y15 Y14\n"
    "packuswb %%mm1,%%mm6 # b13 g13 r13 b12 g12 r12 b9 g9\n"
    "addl $32,%%edx\n"
    "punpcklbw empty,%%mm5 # Y15 Y15 Y15 Y14\n"
    "addl $4,%%ebx\n"
    "paddsw %%mm4,%%mm5 # b15 g15 r15 b14\n"
"# move to memory b13 g13 r13 b12 g12 r12 b9 g9\n"
    "pushl %%ecx\n"
    "movd %%mm6,%%ecx # put on stack g12 r12 b9 g9\n"
    "shll $8,%%ecx # r12 b9 g9 0\n"
    "orl %%ecx,-12(%%edi) # or into memory b9 g9\n"
    "popl %%ecx\n"
    "psrlq $16,%%mm6 # 0 0 b13 g13 r13 b12 g12 r12\n"
    "movd %%mm6,-8(%%edi) # move to memory b12 g12 r12\n"
    "psrlq $24,%%mm6 # 0 0 0 0 0 b13 g13 r13\n"
    "movd %%mm6,-4(%%edi) # move to memory b13 g13 r13\n"
    "packuswb %%mm5,%%mm7 # b15 g15 r15 b14 g14 r14 b11 g11\n"
    "addl $4,%%ecx\n"
"# move to memory b15 g15 r15 b14 g14 r14 b11 g11\n"
    "pushl %%eax\n"
    "movd %%mm7,%%eax # put on stack g14 r14 b11 g11\n"
    "shll $8,%%eax # r14 b11 g11 0\n"
    "orl %%eax,-12(%%edx) # or into memory b11 g11\n"
    "popl %%eax\n"
    "psrlq $16,%%mm7 # 0 0 b15 g15 r15 b14 g14 r14\n"
    "movd %%mm7,-8(%%edx) # move to memory b14 g14 r14\n"
    "psrlq $24,%%mm7 # 0 0 0 0 0 b15 g15 r15\n"
    "movd %%mm7,-4(%%edx) # move to memory b15 g15 r15\n"
"####\n"
/* Row and loop bookkeeping; the counters live in memory symbols. */
    "addl $-8,segwidth\n"
    "jz end_of_line16_1\n"
"ltest16_1:\n"
    "addl $-16,picsize # cols_asm should be replaced by a register that has the num of cols\n"
    "testl $0xffffffff,picsize # sets flag to 0 if edx is 0\n"
    "jnz do_next16_1 # if cols_asm is not zero, loop\n"
    "jmp end16_1\n"
"end_of_line16_1:\n"
    "addl pstride,%%esi # yp += pstride\n"
    "addl cstride,%%ebx # up += cstride\n"
    "addl cstride,%%ecx # vp += cstride\n"
    "addl pstride2,%%edi # xip += pstride\n"
    "addl pstride,%%eax\n"
    "addl pstride2,%%edx\n"
    "movd segwidthconst,%%mm7\n"
    "movd %%mm7,segwidth\n"
    "jmp ltest16_1\n"
"end16_1:\n"
    "emms\n"

	  /*
	   * Every pointer register is advanced by the text above, so each
	   * one is an output as well as an input.
	   */
	  : "=S" (yp), "=b" (up), "=c" (vp), "=D" (xip), "=d" (xip2), "=a" (yp2)
	  : "S" (yp), "b" (up), "c" (vp), "D" (xip), "d" (xip2), "a" (yp2)
	  : "memory"
	  );
}

/*
 * TrueWindowRenderer24::map_422_asm --
 *
 *      MMX inline-assembly YUV -> RGB loop for the 24-bit renderer.
 *
 *      Each pass of the loop loads 8 luma bytes through yp (%esi) and
 *      4 bytes each through up (%ebx) and vp (%ecx), so one chroma pair
 *      is shared by two horizontally adjacent pixels.  The packed result
 *      is written with three 8-byte movq stores at offsets 0, 8 and 16
 *      from xip (%edi), i.e. 24 bytes for 8 pixels.  The pointers then
 *      advance by 8, 4, 4 and 24 bytes respectively.  (The comments
 *      embedded in the asm text label %ebx as "Cr" and %ecx as "Cb".)
 *
 *      Lines of asm text that start with '#' are assembler comments;
 *      the long runs of them hold the per-pixel 32-bit store sequence
 *      that the movq stores replace.  The leading "#movl" lines also
 *      mention %eax and %edx pointers, but this function binds only
 *      %esi, %ebx, %ecx and %edi; %eax is used solely as scratch at the
 *      end of a row.
 *
 *      The asm text refers to the following symbols by name instead of
 *      taking them as operands, so they have to be defined elsewhere
 *      with exactly these assembler-visible names:
 *          const128, const05, const1, const15, const2, const45, const5,
 *          const55, const6, davemask, empty      (only read here)
 *          segwidth, picsize                     (counters, modified here)
 *          segwidthconst, pstride, cstride, pstride2  (read at end of row)
 *      Presumably the caller initialises them before the call -- confirm
 *      against the call site.
 *
 *      segwidth drops by 8 per pass; when it reaches zero the row-end
 *      code adds pstride to yp, cstride to up and vp, pstride2 to xip and
 *      reloads segwidth from segwidthconst.  picsize also drops by 8 per
 *      pass and the loop ends only when it is exactly zero; the body is
 *      executed once before the first test.
 *
 *      NOTE(review): the labels are plain (non-local) assembler labels in
 *      an inline function, so they would collide if the body were ever
 *      emitted more than once in a translation unit.  %mm0-%mm7 are
 *      modified without appearing in the clobber list.
 *
 *      NOTE(review): the two instructions that target %mm5 just before
 *      the final store produce a value that is overwritten on the next
 *      pass without being read; they look like leftovers from the 411
 *      variant.
 */
inline void TrueWindowRenderer24::map_422_asm(const u_char* yp, const u_char* up,
					      const u_char* vp, char* xip) const {
  __asm__ __volatile__(
    "#movl $inptr00, %%esi Y pointers\n"
    "#movl $inptr01, %%eax\n"
    "#movl Cr pointer, %%ebx\n"
    "#movl Cb pointer, %%ecx\n"
    "#movl $outptr0, %%edi output pointers\n"
    "#movl $outptr1, %%edx\n"
/* Loop head: one pass converts 8 luma bytes into 24 output bytes. */
"do_next8_2:\n"
    "movd (%%ebx),%%mm0 # 0 0 0 0 Cr3 Cr2 Cr1 Cr0\n"
    "pxor %%mm6,%%mm6\n"
    "punpcklbw %%mm0,%%mm0 # Cr3 Cr3 Cr2 Cr2 Cr1 Cr1 Cr0 Cr0\n"
    "movq const128,%%mm7\n"
    "punpcklwd %%mm0,%%mm0 # Cr1 Cr1 Cr1 Cr1 Cr0 Cr0 Cr0 Cr0\n"
    "movq %%mm0,%%mm4\n"
    "punpcklbw %%mm6,%%mm0 # Cr0 Cr0 Cr0 Cr0\n"
    "psubsw %%mm7,%%mm0 # Cr0 - 128:Cr0-128:Cr0-128:Cr0 -128\n"
    "movd (%%ecx),%%mm1 # 0 0 0 0 Cb3 Cb2 Cb1 Cb0\n"
    "psllw $2,%%mm0 # left shift by 2 bits\n"
    "punpcklbw %%mm1,%%mm1 # Cb3 Cb3 Cb2 Cb2 Cb1 Cb1 Cb0 Cb0\n"
    "paddsw const05,%%mm0 # add (one_half/fix(x)) << 2\n"
    "punpcklwd %%mm1,%%mm1 # Cb1 Cb1 Cb1 Cb1 Cb0 Cb0 Cb0 Cb0\n"
    "movq %%mm1,%%mm5\n"
    "pmulhw const1,%%mm0 # multiply by (fix(x) >> 1)\n"
    "punpcklbw %%mm6,%%mm1 # Cb0 Cb0 Cb0 Cb0\n"
    "punpckhbw %%mm6,%%mm4 # Cr1 Cr1 Cr1 Cr1\n"
    "psubsw %%mm7,%%mm1 # Cb0 - 128:Cb0-128:Cb0-128:Cb0 -128\n"
    "punpckhbw %%mm6,%%mm5 # Cb1 Cb1 Cb1 Cb1\n"
    "psllw $2,%%mm1 # left shift by 2 bits\n"
    "paddsw const15,%%mm1 # add (one_half/fix(x)) << 2\n"
    "psubsw %%mm7,%%mm4 # Cr1 - 128:Cr1-128:Cr1-128:Cr1 -128\n"
    "psubsw %%mm7,%%mm5 # Cb1 - 128:Cb1-128:Cb1-128:Cb1 -128\n"
    "pmulhw const2,%%mm1 # multiply by (fix(x) >> 1)\n"
    "psllw $2,%%mm4 # left shift by 2 bits\n"
    "psllw $2,%%mm5 # left shift by 2 bits\n"
    "paddsw const45,%%mm4 # add (one_half/fix(x)) << 2\n"
    "movd (%%esi),%%mm7 # Y13 Y12 Y9 Y8 Y5 Y4 Y1 Y0\n"
    "pmulhw const5,%%mm4 # multiply by (fix(x) >> 1)\n"
    "movq %%mm7,%%mm6\n"
    "punpcklbw %%mm7,%%mm7 # Y5 Y5 Y4 Y4 Y1 Y1 Y0 Y0\n"
    "paddsw const55,%%mm5 # add (one_half/fix(x)) << 2\n"
    "paddsw %%mm1,%%mm0 # cred0 cbl0 cgr0 cred0\n"
    "movq %%mm7,%%mm1\n"
    "pmulhw const6,%%mm5 # multiply by (fix(x) >> 1)\n"
    "movq %%mm0,%%mm2 # cred0 cbl0 cgr0 cred0\n"
    "punpcklwd %%mm6,%%mm7 # Y5 Y4 Y1 Y1 Y1 Y0 Y0 Y0\n"
    "pand davemask,%%mm2 # 0 cbl0 cgr0 0\n"
    "psrlq $16,%%mm1 # 0 0 Y5 Y5 Y4 Y4 Y1 Y1\n"
    "psrlq $16,%%mm2 # 0 0 cbl0 cgr0\n"
    "punpcklbw empty,%%mm7 # Y1 Y0 Y0 Y0\n"
    "paddsw %%mm5,%%mm4 # cbl1 cgr1 cred1 cbl1\n"
    "movq %%mm4,%%mm3 # cbl1 cgr1 cred1 cbl1\n"
    "pand davemask,%%mm3 # 0 cgr1 cred1 0\n"
    "paddsw %%mm0,%%mm7 # r1 b0 g0 r0\n"
    "psllq $16,%%mm3 # cgr1 cred1 0 0\n"
    "movq %%mm1,%%mm6 # 0 0 Y5 Y5 Y4 Y4 Y1 Y1\n"
    "por %%mm3,%%mm2 # cgr1 cred1 cbl0 cgr0\n"
    "punpcklbw empty,%%mm6 # Y4 Y4 Y1 Y1\n"
    "paddsw %%mm2,%%mm6 # g4 r4 b1 g1\n"
    "packuswb %%mm6,%%mm7 # g4 r4 b1 g1 r1 b0 g0 r0\n"
"#    movd %%mm7,%%eax # r1 b0 g0 r0\n"
"#    andl $0xffffff,%%eax # 0 b0 g0 r0\n"
"#    movl %%eax,(%%edi) # move to memory b0 g0 r0\n"
"#    psrlq $24,%%mm7 # 0 0 0 g4 r4 b1 g1 r1\n"
"#    movd %%mm7,%%eax # r4 b1 g1 r1\n"
"#    andl $0xffffff,%%eax # 0 b1 g1 r1\n"
"#    movl %%eax,4(%%edi) # move to memory b1 g1 r1\n"
"#    psrlq $24,%%mm7 # 0 0 0 0 0 0 g4 r4\n"
"#    movd %%mm7,%%edx # 0 0 g4 r4\n"
    "movq %%mm7,(%%edi) # move to memory g4 r4 b1 g1 r1 b0 g0 r0\n"
/* Second part of the pass: chroma bytes 2 and 3, luma bytes 4..7. */
    "movq %%mm1,%%mm0 # 0 0 Y5 Y5 Y4 Y4 Y1 Y1\n"
    "psrlq $24,%%mm1 # 0 0 0 0 0 Y5 Y5 Y4\n"
    "psrlq $32,%%mm0 # 0 0 0 0 0 0 Y5 Y5\n"
    "punpcklwd %%mm0,%%mm1 # X X X X Y5 Y5 Y5 Y4\n"
    "movd (%%ebx),%%mm0 # 0 0 0 0 Cr5 Cr4 Cr3 Cr2\n"
    "psrlq $16,%%mm0\n"
    "punpcklbw empty,%%mm1 # Y5 Y5 Y5 Y4\n"
    "paddsw %%mm4,%%mm1 # b5 g5 r5 b4\n"
    "pxor %%mm6,%%mm6 # clear mm6 registr\n"
    "punpcklbw %%mm0,%%mm0 # X X X X Cr3 Cr3 Cr2 Cr2\n"
    "punpcklwd %%mm0,%%mm0 # Cr3 Cr3 Cr3 Cr3 Cr2 Cr2 Cr2 Cr2\n"
    "movq %%mm0,%%mm4\n"
    "movd (%%ecx),%%mm3 # 0 0 0 0 Cb5 Cb4 Cb3 Cb2\n"
    "punpcklbw %%mm6,%%mm0 # Cr2 Cr2 Cr2 Cr2\n"
    "psrlq $16,%%mm3\n"
    "psubsw const128,%%mm0 # Cr2 - 128:Cr2-128:Cr2-128:Cr2 -128\n"
    "punpcklbw %%mm3,%%mm3 # X X X X Cb3 Cb3 Cb2 Cb2\n"
    "psllw $2,%%mm0 # left shift by 2 bits\n"
    "paddsw const05,%%mm0 # add (one_half/fix(x)) << 2\n"
    "punpcklwd %%mm3,%%mm3 # Cb3 Cb3 Cb3 Cb3 Cb2 Cb2 Cb2 Cb2\n"
    "movq %%mm3,%%mm7\n"
    "pmulhw const1,%%mm0 # multiply by (fix(x) >> 1)\n"
    "punpcklbw %%mm6,%%mm3 # Cb2 Cb2 Cb2 Cb2\n"
    "psubsw const128,%%mm3 # Cb0 - 128:Cb0-128:Cb0-128:Cb0 -128\n"
    "punpckhbw %%mm6,%%mm4 # Cr3 Cr3 Cr3 Cr3\n"
    "psllw $2,%%mm3 # left shift by 2 bits\n"
    "paddsw const15,%%mm3 # add (one_half/fix(x)) << 2\n"
    "punpckhbw %%mm6,%%mm7 # Cb3 Cb3 Cb3 Cb3\n"
    "pmulhw const2,%%mm3 # multiply by (fix(x) >> 1)\n"
    "psubsw const128,%%mm7 # Cb3 - 128:Cb3-128:Cb3-128:Cb3 -128\n"
    "paddsw %%mm3,%%mm0 # cred2 cbl2 cgr2 cred2\n"
    "psllw $2,%%mm7 # left shift by 2 bits\n"
    "psubsw const128,%%mm4 # Cr3 - 128:Cr3-128:Cr3-128:Cr3 -128\n"
    "movd 4(%%esi),%%mm3 # Y21 Y20 Y17 Y16 Y13 Y12 Y9 Y8\n"
    "psllw $2,%%mm4 # left shift by 2 bits\n"
    "paddsw const55,%%mm7 # add (one_half/fix(x)) << 2\n"
    "movq %%mm3,%%mm6 # Y21 Y20 Y17 Y16 Y13 Y12 Y9 Y8\n"
    "movq %%mm0,%%mm2\n"
    "pand davemask,%%mm2\n"
    "punpcklbw %%mm3,%%mm3 # Y13 Y13 Y12 Y12 Y9 Y9 Y8 Y8\n"
    "psrlq $16,%%mm2\n"
    "paddsw const45,%%mm4 # add (one_half/fix(x)) << 2\n"
    "punpcklwd %%mm6,%%mm3 # X X X X Y9 Y8 Y8 Y8\n"
    "pmulhw const5,%%mm4 # multiply by (fix(x) >> 1)\n"
    "pmulhw const6,%%mm7 # multiply by (fix(x) >> 1)\n"
    "punpcklbw empty,%%mm3 # Y9 Y8 Y8 Y8\n"
    "paddsw %%mm7,%%mm4 # cbl3 cgr3 cred3 cbl3\n"
    "paddsw %%mm0,%%mm3 # r9 b8 g8 r8\n"
    "movq %%mm4,%%mm7\n"
    "packuswb %%mm3,%%mm1 # r9 b8 g8 r8 b5 g5 r5 b4\n"
    "pand davemask,%%mm7\n"
    "psrlq $8,%%mm6 # 0 Y21 Y20 Y17 Y16 Y13 Y12 Y9\n"
    "psllq $16,%%mm7\n"
"#    movd %%mm1,%%eax # b5 g5 r5 b4\n"
"#    andl $0xff,%%eax # b4\n"
"#    shll $16,%%eax # 0 b4 0 0\n"
"#    orl %%eax,%%edx # 0 b4 g4 r4\n"
"#    movl %%edx,8(%%edi) # move to memory b4 g4 r4\n"
"#    movd %%mm1,%%eax # b5 g5 r5 b4\n"
"#    shrl $8,%%eax # 0 b5 g5 r5\n"
"#    movl %%eax,12(%%edi) # move to memory b5 g5 r5\n"
"#    psrlq $32,%%mm1 # 0 0 0 0 r9 b8 g8 r8\n"
"#    movd %%mm1,%%eax # r9 b8 g8 r8\n"
"#    andl $0xffffff,%%eax # 0 b8 g8 r8\n"
"#    movl %%eax,16(%%edi) # move to memory b8 g8 r8\n"
"#    movd %%mm1,%%edx # r9 b8 g8 r8\n"
"#    shrl $24,%%edx # 0 0 0 r9\n"
    "movq %%mm1,8(%%edi) # move to memory r9 b8 g8 r8 b5 g5 r5 b4\n"
    "por %%mm7,%%mm2\n"
    "pxor %%mm1,%%mm1\n"
    "movq %%mm6,%%mm3 # 0 Y21 Y20 Y17 Y16 Y13 Y12 Y9\n"
    "punpcklbw %%mm6,%%mm6 # X X X X Y12 Y12 Y9 Y9\n"
    "punpcklbw %%mm1,%%mm6 # Y62212 Y12 Y9 Y9\n"
    "paddsw %%mm2,%%mm6 # g12 r12 b9 g9\n"
    "psrlq $8,%%mm3 # 0 0 Y21 Y20 Y17 Y16 Y13 Y12\n"
    "movq %%mm3,%%mm1 # 0 0 Y21 Y20 Y17 Y16 Y13 Y12\n"
    "punpcklbw %%mm3,%%mm3 # X X X X Y13 Y13 Y12 Y12\n"
    "addl $8,%%esi\n"
    "psrlq $16,%%mm3 # X X X X X X Y13 Y13 modified on 09/24\n"
    "punpcklwd %%mm3,%%mm1 # X X X X Y13 Y13 Y13 Y12\n"
    "punpcklbw empty,%%mm1 # Y13 Y13 Y13 Y12\n"
    "paddsw %%mm4,%%mm1 # b13 g13 r13 b12\n"
    "addl $24,%%edi\n"
    "punpcklwd %%mm0,%%mm5 # X X X X Y15 Y15 Y15 Y14\n"
    "packuswb %%mm1,%%mm6 # b13 g13 r13 b12 g12 r12 b9 g9\n"
    "punpcklbw empty,%%mm5 # Y15 Y15 Y15 Y14\n"
    "addl $4,%%ebx\n"
"#    movd %%mm6,%%eax # g12 r12 b9 g9\n"
"#    andl $0xffff,%%eax # 0 0 b9 g9\n"
"#    shll $8,%%eax # 0 b9 g9 0\n"
"#    orl %%edx,%%eax # 0 b9 g9 r9\n"
"#    movl %%eax,-12(%%edi) # move to memory b9 g9 r9\n"
"#    psrlq $16,%%mm6 # 0 0 b13 g13 r13 b12 g12 r12\n"
"#    movd %%mm6,%%eax # r13 b12 g12 r12\n"
"#    andl $0xffffff,%%eax # 0 b12 g12 r12\n"
"#    movl %%eax,-8(%%edi) # move to memory b12 g12 r12\n"
"#    psrlq $24,%%mm6 # 0 0 0 0 0 b13 g13 r13\n"
"#    movd %%mm6,%%eax # 0 b13 g13 r13\n"
"#    movl %%eax,-4(%%edi) # move to memeory b13 g13 r13\n"
    "movq %%mm6,-8(%%edi) # move to memory b13 g13 r13 b12 g12 r12 b9 g9\n"
    "addl $4,%%ecx\n"
"####\n"
/* Row and loop bookkeeping; the counters live in memory symbols. */
    "addl $-8,segwidth\n"
    "jz end_of_line8_2\n"
"ltest8_2:\n"
    "addl $-8,picsize # cols_asm should be replaced by a register that has the num of cols\n"
    "testl $0xffffffff,picsize # sets flag to 0 if edx is 0\n"
    "jnz do_next8_2 # if cols_asm is not zero, loop\n"
    "jmp end8_2\n"
"end_of_line8_2:\n"
    "addl pstride,%%esi # yp += pstride\n"
    "addl cstride,%%ebx # up += cstride\n"
    "addl cstride,%%ecx # vp += cstride\n"
    "addl pstride2,%%edi # xip += 3*pstride\n"
    "movl segwidthconst,%%eax\n"
    "movl %%eax,segwidth\n"
    "jmp ltest8_2\n"
"end8_2:\n"
    "emms\n"
	  /*
	   * The four pointers are pinned to the registers the asm text names
	   * and are also listed as outputs because the text advances them.
	   * %eax is scratch for the end-of-row reload of segwidth.
	   */
	  : "=S" (yp), "=b" (up), "=c" (vp), "=D" (xip)
	  : "S" (yp), "b" (up), "c" (vp), "D" (xip)
	  : "memory", "%eax"
	  );
}

inline void TrueWindowRenderer24::map_411_asm(const u_char* yp, const u_char* up,
					      const u_char* vp, char* xip,
					      u_int iw) const {
  char* xip2 = xip + 3*iw;
  const u_char* yp2 = yp + iw;
  __asm__ __volatile__(
    "#movl $inptr00, %%esi Y pointers\n"
    "#movl $inptr01, %%eax\n"
    "#movl Cr pointer, %%ebx\n"
    "#movl Cb pointer, %%ecx\n"
    "#movl $outptr0, %%edi output pointers\n"
    "#movl $outptr1, %%edx\n"
"do_next16_2:\n"
    "movd (%%ebx),%%mm0 # 0 0 0 0 Cr3 Cr2 Cr1 Cr0\n"
    "pxor %%mm6,%%mm6\n"
    "punpcklbw %%mm0,%%mm0 # Cr3 Cr3 Cr2 Cr2 Cr1 Cr1 Cr0 Cr0\n"
    "movq const128,%%mm7\n"
    "punpcklwd %%mm0,%%mm0 # Cr1 Cr1 Cr1 Cr1 Cr0 Cr0 Cr0 Cr0\n"
    "movq %%mm0,%%mm4\n"
    "punpcklbw %%mm6,%%mm0 # Cr0 Cr0 Cr0 Cr0\n"
    "psubsw %%mm7,%%mm0 # Cr0 - 128:Cr0-128:Cr0-128:Cr0 -128\n"
    "movd (%%ecx),%%mm1 # 0 0 0 0 Cb3 Cb2 Cb1 Cb0\n"
    "psllw $2,%%mm0 # left shift by 2 bits\n"
    "punpcklbw %%mm1,%%mm1 # Cb3 Cb3 Cb2 Cb2 Cb1 Cb1 Cb0 Cb0\n"
    "paddsw const05,%%mm0 # add (one_half/fix(x)) << 2\n"
    "punpcklwd %%mm1,%%mm1 # Cb1 Cb1 Cb1 Cb1 Cb0 Cb0 Cb0 Cb0\n"
    "movq %%mm1,%%mm5\n"
    "pmulhw const1,%%mm0 # multiply by (fix(x) >> 1)\n"
    "punpcklbw %%mm6,%%mm1 # Cb0 Cb0 Cb0 Cb0\n"
    "punpckhbw %%mm6,%%mm4 # Cr1 Cr1 Cr1 Cr1\n"
    "psubsw %%mm7,%%mm1 # Cb0 - 128:Cb0-128:Cb0-128:Cb0 -128\n"
    "punpckhbw %%mm6,%%mm5 # Cb1 Cb1 Cb1 Cb1\n"
    "psllw $2,%%mm1 # left shift by 2 bits\n"
    "paddsw const15,%%mm1 # add (one_half/fix(x)) << 2\n"
    "psubsw %%mm7,%%mm4 # Cr1 - 128:Cr1-128:Cr1-128:Cr1 -128\n"
    "psubsw %%mm7,%%mm5 # Cb1 - 128:Cb1-128:Cb1-128:Cb1 -128\n"
    "pmulhw const2,%%mm1 # multiply by (fix(x) >> 1)\n"
    "psllw $2,%%mm4 # left shift by 2 bits\n"
    "psllw $2,%%mm5 # left shift by 2 bits\n"
    "paddsw const45,%%mm4 # add (one_half/fix(x)) << 2\n"
    "movd (%%esi),%%mm7 # Y13 Y12 Y9 Y8 Y5 Y4 Y1 Y0\n"
    "pmulhw const5,%%mm4 # multiply by (fix(x) >> 1)\n"
    "movq %%mm7,%%mm6\n"
    "punpcklbw %%mm7,%%mm7 # Y5 Y5 Y4 Y4 Y1 Y1 Y0 Y0\n"
    "paddsw const55,%%mm5 # add (one_half/fix(x)) << 2\n"
    "paddsw %%mm1,%%mm0 # cred0 cbl0 cgr0 cred0\n"
    "movq %%mm7,%%mm1\n"
    "pmulhw const6,%%mm5 # multiply by (fix(x) >> 1)\n"
    "movq %%mm0,%%mm2 # cred0 cbl0 cgr0 cred0\n"
    "punpcklwd %%mm6,%%mm7 # Y5 Y4 Y1 Y1 Y1 Y0 Y0 Y0\n"
    "pand davemask,%%mm2 # 0 cbl0 cgr0 0\n"
    "psrlq $16,%%mm1 # 0 0 Y5 Y5 Y4 Y4 Y1 Y1\n"
    "psrlq $16,%%mm2 # 0 0 cbl0 cgr0\n"
    "punpcklbw empty,%%mm7 # Y1 Y0 Y0 Y0\n"
    "paddsw %%mm5,%%mm4 # cbl1 cgr1 cred1 cbl1\n"
    "movq %%mm4,%%mm3 # cbl1 cgr1 cred1 cbl1\n"
    "pand davemask,%%mm3 # 0 cgr1 cred1 0\n"
    "paddsw %%mm0,%%mm7 # r1 b0 g0 r0\n"
    "psllq $16,%%mm3 # cgr1 cred1 0 0\n"
    "movq %%mm1,%%mm6 # 0 0 Y5 Y5 Y4 Y4 Y1 Y1\n"
    "por %%mm3,%%mm2 # cgr1 cred1 cbl0 cgr0\n"
    "punpcklbw empty,%%mm6 # Y4 Y4 Y1 Y1\n"
    "movd (%%eax),%%mm3 # Y15 Y14 Y11 Y10 Y7 Y6 Y3 Y2\n"
    "paddsw %%mm2,%%mm6 # g4 r4 b1 g1\n"
    "packuswb %%mm6,%%mm7 # g4 r4 b1 g1 r1 b0 g0 r0\n"
    "movq %%mm3,%%mm6 # Y15 Y14 Y11 Y10 Y7 Y6 Y3 Y2\n"
    "punpcklbw %%mm3,%%mm3 # Y7 Y7 Y6 Y6 Y3 Y3 Y2 Y2\n"
    "movq %%mm7,(%%edi) # move to memory g4 r4 b1 g1 r1 b0 g0 r0\n"
    "movq %%mm3,%%mm5 # Y7 Y7 Y6 Y6 Y3 Y3 Y2 Y2\n"
    "punpcklwd %%mm6,%%mm3 # X X X X Y3 Y2 Y2 Y2\n"
    "punpcklbw empty,%%mm3 # Y3 Y2 Y2 Y2\n"
    "psrlq $16,%%mm5 # 0 0 Y7 Y7 Y6 Y6 Y3 Y3\n"
    "paddsw %%mm0,%%mm3 # r3 b2 g2 r2\n"
    "movq %%mm5,%%mm6 # 0 0 Y7 Y7 Y6 Y6 Y3 Y3\n"
    "movq %%mm1,%%mm0 # 0 0 Y5 Y5 Y4 Y4 Y1 Y1\n"
    "punpckldq %%mm6,%%mm6 # X X X X Y6 Y6 Y3 Y3\n"
    "punpcklbw empty,%%mm6 # Y6 Y6 Y3 Y3\n"
    "psrlq $24,%%mm1 # 0 0 0 0 0 Y5 Y5 Y4\n"
    "paddsw %%mm2,%%mm6 # g6 r6 b3 g3\n"
    "packuswb %%mm6,%%mm3 # g6 r6 b3 g3 r3 b2 g2 r2\n"
    "movq %%mm5,%%mm2 # 0 0 Y7 Y7 Y6 Y6 Y3 Y3\n"
    "psrlq $32,%%mm0 # 0 0 0 0 0 0 Y5 Y5\n"
    "movq %%mm3,(%%edx) # move to memory g6 r6 b3 g3 r3 b2 g2 r2\n"
    "punpcklwd %%mm0,%%mm1 # X X X X Y5 Y5 Y5 Y4\n"
    "psrlq $24,%%mm5 # 0 0 0 0 0 Y7 Y7 Y6\n"
    "movd (%%ebx),%%mm0 # 0 0 0 0 Cr5 Cr4 Cr3 Cr2\n"
    "psrlq $32,%%mm2 # 0 0 0 0 0 0 Y7 Y7\n"
    "psrlq $16,%%mm0\n"
    "punpcklbw empty,%%mm1 # Y5 Y5 Y5 Y4\n"
    "punpcklwd %%mm2,%%mm5 # X X X X Y7 Y7 Y7 Y6\n"
    "paddsw %%mm4,%%mm1 # b5 g5 r5 b4\n"
    "punpcklbw empty,%%mm5 # Y7 Y7 Y7 Y6\n"
    "pxor %%mm6,%%mm6 # clear mm6 registr\n"
    "punpcklbw %%mm0,%%mm0 # X X X X Cr3 Cr3 Cr2 Cr2\n"
    "paddsw %%mm4,%%mm5 # b7 g7 r7 b6\n"
    "punpcklwd %%mm0,%%mm0 # Cr3 Cr3 Cr3 Cr3 Cr2 Cr2 Cr2 Cr2\n"
    "movq %%mm0,%%mm4\n"
    "movd (%%ecx),%%mm3 # 0 0 0 0 Cb5 Cb4 Cb3 Cb2\n"
    "punpcklbw %%mm6,%%mm0 # Cr2 Cr2 Cr2 Cr2\n"
    "psrlq $16,%%mm3\n"
    "psubsw const128,%%mm0 # Cr2 - 128:Cr2-128:Cr2-128:Cr2 -128\n"
    "punpcklbw %%mm3,%%mm3 # X X X X Cb3 Cb3 Cb2 Cb2\n"
    "psllw $2,%%mm0 # left shift by 2 bits\n"
    "paddsw const05,%%mm0 # add (one_half/fix(x)) << 2\n"
    "punpcklwd %%mm3,%%mm3 # Cb3 Cb3 Cb3 Cb3 Cb2 Cb2 Cb2 Cb2\n"
    "movq %%mm3,%%mm7\n"
    "pmulhw const1,%%mm0 # multiply by (fix(x) >> 1)\n"
    "punpcklbw %%mm6,%%mm3 # Cb2 Cb2 Cb2 Cb2\n"
    "psubsw const128,%%mm3 # Cb0 - 128:Cb0-128:Cb0-128:Cb0 -128\n"
    "punpckhbw %%mm6,%%mm4 # Cr3 Cr3 Cr3 Cr3\n"
    "psllw $2,%%mm3 # left shift by 2 bits\n"
    "paddsw const15,%%mm3 # add (one_half/fix(x)) << 2\n"
    "punpckhbw %%mm6,%%mm7 # Cb3 Cb3 Cb3 Cb3\n"
    "pmulhw const2,%%mm3 # multiply by (fix(x) >> 1)\n"
    "psubsw const128,%%mm7 # Cb3 - 128:Cb3-128:Cb3-128:Cb3 -128\n"
    "paddsw %%mm3,%%mm0 # cred2 cbl2 cgr2 cred2\n"
    "psllw $2,%%mm7 # left shift by 2 bits\n"
    "psubsw const128,%%mm4 # Cr3 - 128:Cr3-128:Cr3-128:Cr3 -128\n"
    "movd 4(%%esi),%%mm3 # Y21 Y20 Y17 Y16 Y13 Y12 Y9 Y8\n"
    "psllw $2,%%mm4 # left shift by 2 bits\n"
    "paddsw const55,%%mm7 # add (one_half/fix(x)) << 2\n"
    "movq %%mm3,%%mm6 # Y21 Y20 Y17 Y16 Y13 Y12 Y9 Y8\n"
    "movq %%mm0,%%mm2\n"
    "pand davemask,%%mm2\n"
    "punpcklbw %%mm3,%%mm3 # Y13 Y13 Y12 Y12 Y9 Y9 Y8 Y8\n"
    "psrlq $16,%%mm2\n"
    "paddsw const45,%%mm4 # add (one_half/fix(x)) << 2\n"
    "punpcklwd %%mm6,%%mm3 # X X X X Y9 Y8 Y8 Y8\n"
    "pmulhw const5,%%mm4 # multiply by (fix(x) >> 1)\n"
    "pmulhw const6,%%mm7 # multiply by (fix(x) >> 1)\n"
    "punpcklbw empty,%%mm3 # Y9 Y8 Y8 Y8\n"
    "paddsw %%mm7,%%mm4 # cbl3 cgr3 cred3 cbl3\n"
    "paddsw %%mm0,%%mm3 # r9 b8 g8 r8\n"
    "movq %%mm4,%%mm7\n"
    "packuswb %%mm3,%%mm1 # r9 b8 g8 r8 b5 g5 r5 b4\n"
    "movd 4(%%eax),%%mm3 # Y23 Y22 Y19 Y18 Y15 Y14 Y11 Y10\n"
    "pand davemask,%%mm7\n"
    "psrlq $8,%%mm6 # 0 Y21 Y20 Y17 Y16 Y13 Y12 Y9\n"
    "psllq $16,%%mm7\n"
    "movq %%mm1,8(%%edi) # move to memory r9 b8 g8 r8 b5 g5 r5 b4\n"
    "por %%mm7,%%mm2\n"
    "movq %%mm3,%%mm7 # Y23 Y22 Y19 Y18 Y15 Y14 Y11 Y10\n"
    "punpcklbw %%mm3,%%mm3 # X X X X Y11 Y11 Y10 Y10\n"
    "pxor %%mm1,%%mm1\n"
    "punpcklwd %%mm7,%%mm3 # X X X X Y11 Y10 Y10 Y10\n"
    "punpcklbw %%mm1,%%mm3 # Y11 Y10 Y10 Y10\n"
    "psrlq $8,%%mm7 # 0 Y23 Y22 Y19 Y18 Y15 Y14 Y11\n"
    "paddsw %%mm0,%%mm3 # r11 b10 g10 r10\n"
    "movq %%mm7,%%mm0 # 0 Y23 Y22 Y19 Y18 Y15 Y14 Y11\n"
    "packuswb %%mm3,%%mm5 # r11 b10 g10 r10 b7 g7 r7 b6\n"
    "punpcklbw %%mm7,%%mm7 # X X X X Y14 Y14 Y11 Y11\n"
    "movq %%mm5,8(%%edx) # move to memory r11 b10 g10 r10 b7 g7 r7 b6\n"
    "movq %%mm6,%%mm3 # 0 Y21 Y20 Y17 Y16 Y13 Y12 Y9\n"
    "punpcklbw %%mm6,%%mm6 # X X X X Y12 Y12 Y9 Y9\n"
    "punpcklbw %%mm1,%%mm7 # Y14 Y14 Y11 Y11\n"
    "punpcklbw %%mm1,%%mm6 # Y12 Y12 Y9 Y9\n"
    "paddsw %%mm2,%%mm7 # g14 r14 b11 g11\n"
    "paddsw %%mm2,%%mm6 # g12 r12 b9 g9\n"
    "psrlq $8,%%mm3 # 0 0 Y21 Y20 Y17 Y16 Y13 Y12\n"
    "movq %%mm3,%%mm1 # 0 0 Y21 Y20 Y17 Y16 Y13 Y12\n"
    "punpcklbw %%mm3,%%mm3 # X X X X Y13 Y13 Y12 Y12\n"
    "addl $8,%%esi\n"
    "psrlq $16,%%mm3 # X X X X X X Y13 Y13 modified on 09/24\n"
    "punpcklwd %%mm3,%%mm1 # X X X X Y13 Y13 Y13 Y12\n"
    "addl $8,%%eax\n"
    "psrlq $8,%%mm0 # 0 0 Y23 Y22 Y19 Y18 Y15 Y14\n"
    "punpcklbw empty,%%mm1 # Y13 Y13 Y13 Y12\n"
    "movq %%mm0,%%mm5 # 0 0 Y23 Y22 Y19 Y18 Y15 Y14\n"
    "punpcklbw %%mm0,%%mm0 # X X X X Y15 Y15 Y14 Y14\n"
    "paddsw %%mm4,%%mm1 # b13 g13 r13 b12\n"
    "psrlq $16,%%mm0 # X X X X X X Y15 Y15\n"
    "addl $24,%%edi\n"
    "punpcklwd %%mm0,%%mm5 # X X X X Y15 Y15 Y15 Y14\n"
    "packuswb %%mm1,%%mm6 # b13 g13 r13 b12 g12 r12 b9 g9\n"
    "addl $24,%%edx\n"
    "punpcklbw empty,%%mm5 # Y15 Y15 Y15 Y14\n"
    "addl $4,%%ebx\n"
    "paddsw %%mm4,%%mm5 # b15 g15 r15 b14\n"
    "movq %%mm6,-8(%%edi) # move to memory b13 g13 r13 b12 g12 r12 b9 g9\n"
    "packuswb %%mm5,%%mm7 # b15 g15 r15 b14 g14 r14 b11 g11\n"
    "addl $4,%%ecx\n"
    "movq %%mm7,-8(%%edx) # move to memory b15 g15 r15 b14 g14 r14 b11 g11\n"
"####\n"
    "addl $-8,segwidth\n"
    "jz end_of_line16_2\n"
"ltest16_2:\n"
    "addl $-16,picsize # cols_asm should be replaced by a register that has the num of cols\n"
    "testl $0xffffffff,picsize # sets flag to 0 if edx is 0\n"
    "jnz do_next16_2 # if cols_asm is not zero, loop\n"
    "jmp end16_2\n"
"end_of_line16_2:\n"
    "addl pstride,%%esi # yp += pstride\n"
    "addl cstride,%%ebx # up += cstride\n"
    "addl cstride,%%ecx # vp += cstride\n"
    "addl pstride2,%%edi # xip += 3*pstride\n"
    "movd segwidthconst,%%mm7\n"
    "movd %%mm7,segwidth\n"
    "jmp ltest16_2\n"
"end16_2:\n"
    "emms\n"
	  : /*"=S" (yp), "=b" (up), "=c" (vp), "=D" (xip), "=d" (xip2), "=a" (yp2)*/
	  : "S" (yp), "b" (up), "c" (vp), "D" (xip), "d" (xip2), "a" (yp2)
	  : "memory"
	  );
}
#endif
