
#include <spu_intrinsics.h>

/* Number of fractional bits used by the fixed-point colour coefficients. */
#define SHIFT (14)

/* Scale a real coefficient by 2^SHIFT and truncate it to an int. */
#define YUV_FIXED(coeff) ((int) ((coeff) * (1<<SHIFT)))

/*
 * Fixed-point conversion coefficients:
 *   YSCALE  - weight of Y in every channel
 *   BUSCALE - weight of (U - 128) in blue
 *   GVSCALE - weight of (V - 128) in green
 *   GUSCALE - weight of (U - 128) in green
 *   RVSCALE - weight of (V - 128) in red
 */
#define YSCALE  YUV_FIXED( 1.0    )
#define BUSCALE YUV_FIXED( 1.772  )
#define GVSCALE YUV_FIXED(-0.71414)
#define GUSCALE YUV_FIXED(-0.34414)
#define RVSCALE YUV_FIXED( 1.402  )

void yuv2rgb_short(vector unsigned char *yp, vector unsigned char *up, vector unsigned char *vp, vector unsigned char *argbp, int count)
{
	int x;
	const vec_short8 yscale  = spu_splats((short)YSCALE);
	const vec_short8 buscale = spu_splats((short)BUSCALE);
	const vec_short8 gvscale = spu_splats((short)GVSCALE);
	const vec_short8 guscale = spu_splats((short)GUSCALE);
	const vec_short8 rvscale = spu_splats((short)RVSCALE);
	const vec_short8 clamp   = spu_splats((short) 255 );

	const vec_ushort8 shuffle_y_0  = (vec_ushort8) { 0x8000, 0x8001, 0x8002, 0x8003, 0x8004, 0x8005, 0x8006, 0x8007 };
	const vec_ushort8 shuffle_uv_0 = (vec_ushort8) { 0x8000, 0x8000, 0x8001, 0x8001, 0x8002, 0x8002, 0x8003, 0x8003 };

	const vec_ushort8 shuffle_toshort = (vec_ushort8) { 0x0203, 0x1213, 0x0607, 0x1617, 0x0a0b, 0x1a1b, 0x0e0f, 0x1e1f };

	const vec_uint4 shuffle_rg0    = (vec_uint4) { 0x80011180, 0x80031380, 0x80051580, 0x80071780 };
	const vec_uint4 shuffle_rg1    = (vec_uint4) { 0x80091980, 0x800b1b80, 0x800d1d80, 0x800f1f80 };

	const vec_uint4 shuffle_rgb0    = (vec_uint4) { 0x80010211, 0x80050613, 0x80090a15, 0x800d0e17 };
	const vec_uint4 shuffle_rgb1    = (vec_uint4) { 0x80010219, 0x8005061b, 0x80090a1d, 0x800d0e1f };

	for (x=0;x<count>>1;x+=16) {
		vec_short8 y;
		vec_short8 u;
		vec_short8 v;

		vec_short8 y0, y1;
		vec_short8 u0, u1;
		vec_short8 v0, v1;

		vec_int4 y0odd, y0even, y1odd, y1even;

		vec_short8 r0, r1;
		vec_short8 g0, g1;
		vec_short8 b0, b1;

		vec_int4 r0odd, r1odd, r0even, r1even;
		vec_int4 g0odd, g1odd, g0even, g1even;
		vec_int4 b0odd, b1odd, b0even, b1even;

		vec_ushort8 shuffle_y  = shuffle_y_0;
		vec_ushort8 shuffle_uv = shuffle_uv_0;

		y = (vec_short8)si_lqd((qword)spu_promote((unsigned int)yp, 0), 0);
		u = (vec_short8)si_lqx((qword)spu_promote((unsigned int)up, 0), (qword)spu_promote(x, 0));
		v = (vec_short8)si_lqx((qword)spu_promote((unsigned int)vp, 0), (qword)spu_promote(x, 0));

		// convert 16 YUV values into 2x8 value vectors
		y0 = spu_shuffle(y, y, (vec_uchar16)shuffle_y);
		shuffle_y = spu_add(shuffle_y, 8);
		u0 = spu_shuffle(u, u, (vec_uchar16)shuffle_uv);
		v0 = spu_shuffle(v, v, (vec_uchar16)shuffle_uv);
		shuffle_uv = spu_add(shuffle_uv, 4);

		y1 = spu_shuffle(y, y, (vec_uchar16)shuffle_y);
		//shuffle_y = spu_add(shuffle_y, 8);
		u1 = spu_shuffle(u, u, (vec_uchar16)shuffle_uv);
		v1 = spu_shuffle(v, v, (vec_uchar16)shuffle_uv);
		shuffle_uv = spu_add(shuffle_uv, 4);

		shuffle_y  = shuffle_y_0;
		y = (vec_short8)si_lqd((qword)spu_promote((unsigned int)yp, 0), 16);
		yp += 2;

		// pre-calculate common subexpressions
		y0odd = spu_mulo(y0, yscale);
		y0even = spu_mule(y0, yscale);
		v0 = spu_add(v0, -128);
		u0 = spu_add(u0, -128);

		y1odd = spu_mulo(y1, yscale);
		y1even = spu_mule(y1, yscale);
		v1 = spu_add(v1, -128);
		u1 = spu_add(u1, -128);

		// perform basic calculation
		r0odd = spu_madd(v0, rvscale, y0odd);
		g0odd = spu_madd(v0, gvscale, y0odd);
		g0odd = spu_madd(u0, guscale, g0odd);
		b0odd = spu_madd(u0, buscale, y0odd);

		r0even = spu_mhhadd(v0, rvscale, y0even);
		g0even = spu_mhhadd(v0, gvscale, y0even);
		g0even = spu_mhhadd(u0, guscale, g0even);
		b0even = spu_mhhadd(u0, buscale, y0even);

		r1odd = spu_madd(v1, rvscale, y1odd);
		g1odd = spu_madd(v1, gvscale, y1odd);
		g1odd = spu_madd(u1, guscale, g1odd);
		b1odd = spu_madd(u1, buscale, y1odd);

		r1even = spu_mhhadd(v1, rvscale, y1even);
		g1even = spu_mhhadd(v1, gvscale, y1even);
		g1even = spu_mhhadd(u1, guscale, g1even);
		b1even = spu_mhhadd(u1, buscale, y1even);

		// fix point
		r0odd = spu_rlmaska(r0odd, -SHIFT);
		r0even = spu_rlmaska(r0even, -SHIFT);
		g0odd = spu_rlmaska(g0odd, -SHIFT);
		g0even = spu_rlmaska(g0even, -SHIFT);
		b0odd = spu_rlmaska(b0odd, -SHIFT);
		b0even = spu_rlmaska(b0even, -SHIFT);

		r1odd = spu_rlmaska(r1odd, -SHIFT);
		r1even = spu_rlmaska(r1even, -SHIFT);
		g1odd = spu_rlmaska(g1odd, -SHIFT);
		g1even = spu_rlmaska(g1even, -SHIFT);
		b1odd = spu_rlmaska(b1odd, -SHIFT);
		b1even = spu_rlmaska(b1even, -SHIFT);

		// convert back to shorts
		r0 = (vec_short8)spu_shuffle(r0even, r0odd, (vec_uchar16)shuffle_toshort);
		g0 = (vec_short8)spu_shuffle(g0even, g0odd, (vec_uchar16)shuffle_toshort);
		b0 = (vec_short8)spu_shuffle(b0even, b0odd, (vec_uchar16)shuffle_toshort);

		r1 = (vec_short8)spu_shuffle(r1even, r1odd, (vec_uchar16)shuffle_toshort);
		g1 = (vec_short8)spu_shuffle(g1even, g1odd, (vec_uchar16)shuffle_toshort);
		b1 = (vec_short8)spu_shuffle(b1even, b1odd, (vec_uchar16)shuffle_toshort);

		// clamp to maximum
		r0 = spu_sel(r0, clamp,  spu_cmpgt(r0, clamp));
		g0 = spu_sel(g0, clamp,  spu_cmpgt(g0, clamp));
		b0 = spu_sel(b0, clamp,  spu_cmpgt(b0, clamp));

		r1 = spu_sel(r1, clamp,  spu_cmpgt(r1, clamp));
		g1 = spu_sel(g1, clamp,  spu_cmpgt(g1, clamp));
		b1 = spu_sel(b1, clamp,  spu_cmpgt(b1, clamp));

		// clamp to minimum
		r0 = spu_and(r0, (vec_short8)spu_cmpgt(r0, 0));
		g0 = spu_and(g0, (vec_short8)spu_cmpgt(g0, 0));
		b0 = spu_and(b0, (vec_short8)spu_cmpgt(b0, 0));

		r1 = spu_and(r1, (vec_short8)spu_cmpgt(r1, 0));
		g1 = spu_and(g1, (vec_short8)spu_cmpgt(g1, 0));
		b1 = spu_and(b1, (vec_short8)spu_cmpgt(b1, 0));

		// pack and write argb values
		argbp[0] = (vec_uchar16)spu_shuffle(spu_shuffle(r0, g0, (vec_uchar16)shuffle_rg0), b0, (vec_uchar16)shuffle_rgb0);
		argbp[1] = (vec_uchar16)spu_shuffle(spu_shuffle(r0, g0, (vec_uchar16)shuffle_rg1), b0, (vec_uchar16)shuffle_rgb1);
		argbp[2] = (vec_uchar16)spu_shuffle(spu_shuffle(r1, g1, (vec_uchar16)shuffle_rg0), b1, (vec_uchar16)shuffle_rgb0);
		argbp[3] = (vec_uchar16)spu_shuffle(spu_shuffle(r1, g1, (vec_uchar16)shuffle_rg1), b1, (vec_uchar16)shuffle_rgb1);

		// convert 16 YUV values into 2x8 value vectors
		y0 = spu_shuffle(y, y, (vec_uchar16)shuffle_y);
		shuffle_y = spu_add(shuffle_y, 8);
		u0 = spu_shuffle(u, u, (vec_uchar16)shuffle_uv);
		v0 = spu_shuffle(v, v, (vec_uchar16)shuffle_uv);
		shuffle_uv = spu_add(shuffle_uv, 4);

		y1 = spu_shuffle(y, y, (vec_uchar16)shuffle_y);
		//shuffle_y = spu_add(shuffle_y, 8);
		u1 = spu_shuffle(u, u, (vec_uchar16)shuffle_uv);
		v1 = spu_shuffle(v, v, (vec_uchar16)shuffle_uv);
		//shuffle_uv = spu_add(shuffle_uv, 4);

		// pre-calculate common subexpressions
		y0odd = spu_mulo(y0, yscale);
		y0even = spu_mule(y0, yscale);
		v0 = spu_add(v0, -128);
		u0 = spu_add(u0, -128);

		y1odd = spu_mulo(y1, yscale);
		y1even = spu_mule(y1, yscale);
		v1 = spu_add(v1, -128);
		u1 = spu_add(u1, -128);

		// perform basic calculation
		r0odd = spu_madd(v0, rvscale, y0odd);
		g0odd = spu_madd(v0, gvscale, y0odd);
		g0odd = spu_madd(u0, guscale, g0odd);
		b0odd = spu_madd(u0, buscale, y0odd);

		r0even = spu_mhhadd(v0, rvscale, y0even);
		g0even = spu_mhhadd(v0, gvscale, y0even);
		g0even = spu_mhhadd(u0, guscale, g0even);
		b0even = spu_mhhadd(u0, buscale, y0even);

		r1odd = spu_madd(v1, rvscale, y1odd);
		g1odd = spu_madd(v1, gvscale, y1odd);
		g1odd = spu_madd(u1, guscale, g1odd);
		b1odd = spu_madd(u1, buscale, y1odd);

		r1even = spu_mhhadd(v1, rvscale, y1even);
		g1even = spu_mhhadd(v1, gvscale, y1even);
		g1even = spu_mhhadd(u1, guscale, g1even);
		b1even = spu_mhhadd(u1, buscale, y1even);

		// fix point
		r0odd = spu_rlmaska(r0odd, -SHIFT);
		r0even = spu_rlmaska(r0even, -SHIFT);
		g0odd = spu_rlmaska(g0odd, -SHIFT);
		g0even = spu_rlmaska(g0even, -SHIFT);
		b0odd = spu_rlmaska(b0odd, -SHIFT);
		b0even = spu_rlmaska(b0even, -SHIFT);

		r1odd = spu_rlmaska(r1odd, -SHIFT);
		r1even = spu_rlmaska(r1even, -SHIFT);
		g1odd = spu_rlmaska(g1odd, -SHIFT);
		g1even = spu_rlmaska(g1even, -SHIFT);
		b1odd = spu_rlmaska(b1odd, -SHIFT);
		b1even = spu_rlmaska(b1even, -SHIFT);

		// convert back to shorts
		r0 = (vec_short8)spu_shuffle(r0even, r0odd, (vec_uchar16)shuffle_toshort);
		g0 = (vec_short8)spu_shuffle(g0even, g0odd, (vec_uchar16)shuffle_toshort);
		b0 = (vec_short8)spu_shuffle(b0even, b0odd, (vec_uchar16)shuffle_toshort);

		r1 = (vec_short8)spu_shuffle(r1even, r1odd, (vec_uchar16)shuffle_toshort);
		g1 = (vec_short8)spu_shuffle(g1even, g1odd, (vec_uchar16)shuffle_toshort);
		b1 = (vec_short8)spu_shuffle(b1even, b1odd, (vec_uchar16)shuffle_toshort);

		// clamp to maximum
		r0 = spu_sel(r0, clamp,  spu_cmpgt(r0, clamp));
		g0 = spu_sel(g0, clamp,  spu_cmpgt(g0, clamp));
		b0 = spu_sel(b0, clamp,  spu_cmpgt(b0, clamp));

		r1 = spu_sel(r1, clamp,  spu_cmpgt(r1, clamp));
		g1 = spu_sel(g1, clamp,  spu_cmpgt(g1, clamp));
		b1 = spu_sel(b1, clamp,  spu_cmpgt(b1, clamp));

		// clamp to minimum
		r0 = spu_and(r0, (vec_short8)spu_cmpgt(r0, 0));
		g0 = spu_and(g0, (vec_short8)spu_cmpgt(g0, 0));
		b0 = spu_and(b0, (vec_short8)spu_cmpgt(b0, 0));

		r1 = spu_and(r1, (vec_short8)spu_cmpgt(r1, 0));
		g1 = spu_and(g1, (vec_short8)spu_cmpgt(g1, 0));
		b1 = spu_and(b1, (vec_short8)spu_cmpgt(b1, 0));

		// pack and write argb values
		argbp[4] = (vec_uchar16)spu_shuffle(spu_shuffle(r0, g0, (vec_uchar16)shuffle_rg0), b0, (vec_uchar16)shuffle_rgb0);
		argbp[5] = (vec_uchar16)spu_shuffle(spu_shuffle(r0, g0, (vec_uchar16)shuffle_rg1), b0, (vec_uchar16)shuffle_rgb1);
		argbp[6] = (vec_uchar16)spu_shuffle(spu_shuffle(r1, g1, (vec_uchar16)shuffle_rg0), b1, (vec_uchar16)shuffle_rgb0);
		argbp[7] = (vec_uchar16)spu_shuffle(spu_shuffle(r1, g1, (vec_uchar16)shuffle_rg1), b1, (vec_uchar16)shuffle_rgb1);

		argbp += 8;
	}
}

/* Scratch storage of 2048 vectors; main() passes it as every pointer argument of yuv2rgb_short(). */
static vec_uchar16 buffer[2048];

// yuv2rgb_short = 0.382
// yuv2rgb       = 0.535

// yuv2rgb_short_2 = 0.347

// nearest scale = 0.459
// linear scale  = 0.925

// time of 3 = 0.314
// time of 4 = 0.304

/*
 * Driver: runs yuv2rgb_short() 100000 times over 2048 pixels.
 *
 * The same buffer is passed as the Y, U, V and output pointer, so the
 * converted pixels overwrite the input; presumably only the run time is of
 * interest here (see the timing notes above) - confirm before reusing the
 * output for anything else.
 */
int main(void)
{
	int i;

	for (i = 0; i < 100000; i++) {
		yuv2rgb_short(buffer, buffer, buffer, buffer, 2048);
	}

	/* explicit status: falling off the end of main is only defined from C99 on */
	return 0;
}


