
#include <spu_intrinsics.h>

#define SHIFT (14)

void yuv2rgb(vec_uchar16 *yp, vec_uchar16 *up, vec_uchar16 *vp,
	     vec_uchar16 *outp,
	     int count)
{
	vec_uint4 shuffle_y_0  = (vec_uint4) { 0x80808000, 0x80808001, 0x80808002, 0x80808003 };
	vec_uint4 shuffle_uv_0 = (vec_uint4) { 0x80808000, 0x80808000, 0x80808001, 0x80808001 };
	vec_int4 clamp = spu_splats( 255 );
	vec_int4 ym  = spu_splats((int)( 1.000 * ((1<<SHIFT))));
	vec_int4 bum = spu_splats((int)( 1.772 * ((1<<SHIFT))));
	vec_int4 gvm = spu_splats((int)(-0.71414 * ((1<<SHIFT))));
	vec_int4 gum = spu_splats((int)(-0.34414 * ((1<<SHIFT))));
	vec_int4 rvm = spu_splats((int)( 1.402 * ((1<<SHIFT))));
	int x;

	vec_uint4 shuffle_uv = shuffle_uv_0;

	for (x=0;x<count/16;x+=1) {
		vec_int4 y = (vec_int4)yp[x];
		vec_int4 u = (vec_int4)up[x/2];
		vec_int4 v = (vec_int4)vp[x/2];

		vec_int4 y0, y1, y2, y3;
		vec_int4 u0, u1, u2, u3;
		vec_int4 v0, v1, v2, v3;

		vec_int4 r0, r1, r2, r3;
		vec_int4 g0, g1, g2, g3;
		vec_int4 b0, b1, b2, b3;

		vec_uint4 shuffle_y = shuffle_y_0;

		vec_uint4 cmp0, cmp1, cmp2, cmp3;

		shuffle_uv = (x & 1) == 0 ? shuffle_uv_0 : shuffle_uv;

		// Convert planes of bytes into
		//  32 bit integer vectors
		y0 = spu_shuffle(y, y, (vec_uchar16)shuffle_y);
		shuffle_y = spu_add(shuffle_y, 4);
		u0 = spu_shuffle(u, u, (vec_uchar16)shuffle_uv);
		v0 = spu_shuffle(v, v, (vec_uchar16)shuffle_uv);
		shuffle_uv = spu_add(shuffle_uv, 2);

		y1 = spu_shuffle(y, y, (vec_uchar16)shuffle_y);
		shuffle_y = spu_add(shuffle_y, 4);
		u1 = spu_shuffle(u, u, (vec_uchar16)shuffle_uv);
		v1 = spu_shuffle(v, v, (vec_uchar16)shuffle_uv);
		shuffle_uv = spu_add(shuffle_uv, 2);

		y2 = spu_shuffle(y, y, (vec_uchar16)shuffle_y);
		shuffle_y = spu_add(shuffle_y, 4);
		u2 = spu_shuffle(u, u, (vec_uchar16)shuffle_uv);
		v2 = spu_shuffle(v, v, (vec_uchar16)shuffle_uv);
		shuffle_uv = spu_add(shuffle_uv, 2);

		y3 = spu_shuffle(y, y, (vec_uchar16)shuffle_y);
		//shuffle_y = spu_add(shuffle_y, 4);
		u3 = spu_shuffle(u, u, (vec_uchar16)shuffle_uv);
		v3 = spu_shuffle(v, v, (vec_uchar16)shuffle_uv);
		shuffle_uv = spu_add(shuffle_uv, 2);

		// yx = 1.164 * (y)
		y0 = spu_mulo((vec_short8)y0, (vec_short8)ym);
		y1 = spu_mulo((vec_short8)y1, (vec_short8)ym);
		y2 = spu_mulo((vec_short8)y2, (vec_short8)ym);
		y3 = spu_mulo((vec_short8)y3, (vec_short8)ym);

		v0 = spu_add(v0, -128);
		v1 = spu_add(v1, -128);
		v2 = spu_add(v2, -128);
		v3 = spu_add(v3, -128);

		u0 = spu_add(u0, -128);
		u1 = spu_add(u1, -128);
		u2 = spu_add(u2, -128);
		u3 = spu_add(u3, -128);

		// gx = 1.164*(y-16) - 0.813*(v-128)
		g0 = spu_madd((vec_short8)v0, (vec_short8)gvm, y0);
		g1 = spu_madd((vec_short8)v1, (vec_short8)gvm, y1);
		g2 = spu_madd((vec_short8)v2, (vec_short8)gvm, y2);
		g3 = spu_madd((vec_short8)v3, (vec_short8)gvm, y3);

		// rx = 1.164*(y-16) + 1.596*(v-128)
		r0 = spu_madd((vec_short8)v0, (vec_short8)rvm, y0);
		r1 = spu_madd((vec_short8)v1, (vec_short8)rvm, y1);
		r2 = spu_madd((vec_short8)v2, (vec_short8)rvm, y2);
		r3 = spu_madd((vec_short8)v3, (vec_short8)rvm, y3);

		// bx = 1.164*(y-16) + 2.018*(u-128)
		b0 = spu_madd((vec_short8)u0, (vec_short8)bum, y0);
		b1 = spu_madd((vec_short8)u1, (vec_short8)bum, y1);
		b2 = spu_madd((vec_short8)u2, (vec_short8)bum, y2);
		b3 = spu_madd((vec_short8)u3, (vec_short8)bum, y3);

		// gx = gx - 0.391 * (u-128)
		g0 = spu_madd((vec_short8)u0, (vec_short8)gum, g0);
		g1 = spu_madd((vec_short8)u1, (vec_short8)gum, g1);
		g2 = spu_madd((vec_short8)u2, (vec_short8)gum, g2);
		g3 = spu_madd((vec_short8)u3, (vec_short8)gum, g3);

		// fix point
		r0 = spu_rlmaska(r0, -SHIFT);
		r1 = spu_rlmaska(r1, -SHIFT);
		r2 = spu_rlmaska(r2, -SHIFT);
		r3 = spu_rlmaska(r3, -SHIFT);

		g0 = spu_rlmaska(g0, -SHIFT);
		g1 = spu_rlmaska(g1, -SHIFT);
		g2 = spu_rlmaska(g2, -SHIFT);
		g3 = spu_rlmaska(g3, -SHIFT);

		b0 = spu_rlmaska(b0, -SHIFT);
		b1 = spu_rlmaska(b1, -SHIFT);
		b2 = spu_rlmaska(b2, -SHIFT);
		b3 = spu_rlmaska(b3, -SHIFT);

		// clamp green max
		cmp0 = spu_cmpgt(g0, clamp);
		cmp1 = spu_cmpgt(g1, clamp);
		cmp2 = spu_cmpgt(g2, clamp);
		cmp3 = spu_cmpgt(g3, clamp);

		g0 = spu_sel(g0, clamp, cmp0);
		g1 = spu_sel(g1, clamp, cmp1);
		g2 = spu_sel(g2, clamp, cmp2);
		g3 = spu_sel(g3, clamp, cmp3);

		// min
		cmp0 = spu_cmpgt(g0, 0);
		cmp1 = spu_cmpgt(g1, 0);
		cmp2 = spu_cmpgt(g2, 0);
		cmp3 = spu_cmpgt(g3, 0);

		g0 = spu_and(g0, (vec_int4)cmp0);
		g1 = spu_and(g1, (vec_int4)cmp1);
		g2 = spu_and(g2, (vec_int4)cmp2);
		g3 = spu_and(g3, (vec_int4)cmp3);

		// clamp blue
		cmp0 = spu_cmpgt(b0, clamp);
		cmp1 = spu_cmpgt(b1, clamp);
		cmp2 = spu_cmpgt(b2, clamp);
		cmp3 = spu_cmpgt(b3, clamp);

		b0 = spu_sel(b0, clamp, cmp0);
		b1 = spu_sel(b1, clamp, cmp1);
		b2 = spu_sel(b2, clamp, cmp2);
		b3 = spu_sel(b3, clamp, cmp3);

		// min
		cmp0 = spu_cmpgt(b0, 0);
		cmp1 = spu_cmpgt(b1, 0);
		cmp2 = spu_cmpgt(b2, 0);
		cmp3 = spu_cmpgt(b3, 0);

		b0 = spu_and(b0, (vec_int4)cmp0);
		b1 = spu_and(b1, (vec_int4)cmp1);
		b2 = spu_and(b2, (vec_int4)cmp2);
		b3 = spu_and(b3, (vec_int4)cmp3);

		// clamp red
		cmp0 = spu_cmpgt(r0, clamp);
		cmp1 = spu_cmpgt(r1, clamp);
		cmp2 = spu_cmpgt(r2, clamp);
		cmp3 = spu_cmpgt(r3, clamp);

		r0 = spu_sel(r0, clamp, cmp0);
		r1 = spu_sel(r1, clamp, cmp1);
		r2 = spu_sel(r2, clamp, cmp2);
		r3 = spu_sel(r3, clamp, cmp3);

		// min
		cmp0 = spu_cmpgt(r0, 0);
		cmp1 = spu_cmpgt(r1, 0);
		cmp2 = spu_cmpgt(r2, 0);
		cmp3 = spu_cmpgt(r3, 0);

		r0 = spu_and(r0, (vec_int4)cmp0);
		r1 = spu_and(r1, (vec_int4)cmp1);
		r2 = spu_and(r2, (vec_int4)cmp2);
		r3 = spu_and(r3, (vec_int4)cmp3);

		// shift and form output
		g0 = spu_sl(g0, 8);
		g1 = spu_sl(g1, 8);
		g2 = spu_sl(g2, 8);
		g3 = spu_sl(g3, 8);

		b0 = spu_or(b0, g0);
		b1 = spu_or(b1, g1);
		b2 = spu_or(b2, g2);
		b3 = spu_or(b3, g3);

		r0 = spu_sl(r0, 16);
		r1 = spu_sl(r1, 16);
		r2 = spu_sl(r2, 16);
		r3 = spu_sl(r3, 16);

		b0 = spu_or(b0, r0);
		b1 = spu_or(b1, r1);
		b2 = spu_or(b2, r2);
		b3 = spu_or(b3, r3);

		outp[0] = (vec_uchar16)b0;
		outp[1] = (vec_uchar16)b1;
		outp[2] = (vec_uchar16)b2;
		outp[3] = (vec_uchar16)b3;
		outp += 4;
	}
}

// Scratch storage for the benchmark in main(): 2048 vectors of 16 bytes,
// passed to yuv2rgb() as every input plane and as the output.
static vec_uchar16 buffer[2048];

// yuv2rgb_short = 0.382
// yuv2rgb       = 0.535

// nearest scale = 0.459
// linear scale  = 0.925

/*
 * Benchmark driver: calls yuv2rgb() 100000 times with a pixel count of
 * 2048, using `buffer` for all three input planes and for the output.
 *
 * Because the inputs and the output overlap, the converted values are
 * not meaningful; presumably only the run time is of interest (see the
 * timing notes above) -- TODO confirm.
 *
 * Returns 0 so the exit status is well defined.
 */
int main(void)
{
	int i;

	for (i = 0; i < 100000; i++) {
		yuv2rgb(buffer, buffer, buffer, buffer, 2048);
	}

	return 0;
}


