Index: firmware/target/arm/ipod/video/lcd-video.c =================================================================== --- firmware/target/arm/ipod/video/lcd-video.c (revision 15222) +++ firmware/target/arm/ipod/video/lcd-video.c (working copy) @@ -207,34 +207,6 @@ lcd_update_rect(0, 0, LCD_WIDTH, LCD_HEIGHT); } -/* YUV- > RGB565 conversion - * |R| |1.000000 -0.000001 1.402000| |Y'| - * |G| = |1.000000 -0.334136 -0.714136| |Pb| - * |B| |1.000000 1.772000 0.000000| |Pr| - * Scaled, normalized, rounded and tweaked to yield RGB 565: - * |R| |74 0 101| |Y' - 16| >> 9 - * |G| = |74 -24 -51| |Cb - 128| >> 8 - * |B| |74 128 0| |Cr - 128| >> 9 -*/ - -#define RGBYFAC 74 /* 1.0 */ -#define RVFAC 101 /* 1.402 */ -#define GVFAC (-51) /* -0.714136 */ -#define GUFAC (-24) /* -0.334136 */ -#define BUFAC 128 /* 1.772 */ - -/* ROUNDOFFS contain constant for correct round-offs as well as - constant parts of the conversion matrix (e.g. (Y'-16)*RGBYFAC - -> constant part = -16*RGBYFAC). Through extraction of these - constant parts we save at leat 4 substractions in the conversion - loop */ -#define ROUNDOFFSR (256 - 16*RGBYFAC - 128*RVFAC) -#define ROUNDOFFSG (128 - 16*RGBYFAC - 128*GVFAC - 128*GUFAC) -#define ROUNDOFFSB (256 - 16*RGBYFAC - 128*BUFAC) - -#define MAX_5BIT 0x1f -#define MAX_6BIT 0x3f - /* Performance function to blit a YUV bitmap directly to the LCD */ void lcd_yuv_blit(unsigned char * const src[3], int src_x, int src_y, int stride, @@ -279,83 +251,224 @@ /* wait for it to be write ready */ while ((inw(0x30030000) & 0x2) == 0); - const int ymax = y + height - 1; - const int stride_div_sub_x = stride >> 1; - unsigned char *ysrc = 0; - unsigned char *usrc = 0; - unsigned char *vsrc = 0; - unsigned char *row_end = 0; - int uvoffset; - int yp, up, vp, rc, gc, bc; /* temporary variables */ - int red1, green1, blue1; /* contain RGB of 1st pixel */ - int red2, green2, blue2; /* contain RGB of 2nd pixel */ - - for (; y <= ymax ; y++) + height >>= 1; + const int z = stride * 
src_y; + unsigned char const *ysrc = src[0] + z + src_x; + unsigned char const *usrc = src[1] + (z>>2) + (src_x>>1); + unsigned char const *vsrc = src[2] + (z>>2) + (src_x>>1); + + do { - /* upsampling, YUV->RGB conversion and reduction to RGB565 in one go */ - uvoffset = stride_div_sub_x*(src_y >> 1) + (src_x >> 1); - ysrc = src[0] + stride * src_y + src_x; - usrc = src[1] + uvoffset; - vsrc = src[2] + uvoffset; - - row_end = ysrc + width; - - do - { - up = *usrc++; - vp = *vsrc++; - rc = RVFAC * vp + ROUNDOFFSR; - gc = GVFAC * vp + GUFAC * up + ROUNDOFFSG; - bc = BUFAC * up + ROUNDOFFSB; + /* YUV -> RGB565 conversion + * |R| |1.000000 -0.000001 1.402000| |Y'| + * |G| = |1.000000 -0.334136 -0.714136| |Pb| + * |B| |1.000000 1.772000 0.000000| |Pr| + * Scaled, normalized, rounded and tweaked to yield RGB 565: + * |R| |74 0 101| |Y' - 16| >> 9 + * |G| = |74 -24 -51| |Cb - 128| >> 8 + * |B| |74 128 0| |Cr - 128| >> 9 + */ + asm volatile ( + /* set register for writing LCD-data */ + "mov r8, #0x30000000 \n\t" /* r8 = 0x30000000 (address the pixels are written to) */ - /* Pixel 1 -> RGB565 */ - yp = *ysrc++ * RGBYFAC; - red1 = (yp + rc) >> 9; - green1 = (yp + gc) >> 8; - blue1 = (yp + bc) >> 9; + /* set loop counter */ + "mov r7, %[cnt] \n\t" /* r7 = width */ + + /* 1st loop start: converts the first luma row of the row pair */ + "10: \n\t" /* loop start */ - /* Pixel 2 -> RGB565 */ - yp = *ysrc++ * RGBYFAC; - red2 = (yp + rc) >> 9; - green2 = (yp + gc) >> 8; - blue2 = (yp + bc) >> 9; + "ldrb r0, [%[usrc]], #1 \n\t" /* r0 = *usrc++ = *Cb_p++ */ + "ldrb r1, [%[vsrc]], #1 \n\t" /* r1 = *vsrc++ = *Cr_p++ */ + "ldrb r5, [%[ysrc]], #1 \n\t" /* r5 = *ysrc++ = *Y'_p++ */ + + "sub r0, r0, #128 \n\t" /* r0 = Cb-128 */ + "sub r1, r1, #128 \n\t" /* r1 = Cr-128 */ + + "add r3, r1, r1, asl #1 \n\t" /* r3 = Cr*51 + Cb*24 */ + "add r3, r3, r3, asl #4 \n\t" + "add r3, r3, r0, asl #3 \n\t" + "add r3, r3, r0, asl #4 \n\t" + + "add r4, r1, r1, asl #2 \n\t" /* r1 = Cr*101 */ + "add r4, r4, r1, asl #5 \n\t" + "add r1, r4, r1, asl #6 \n\t" + + "add r1, r1, #256 \n\t" /* 
r1 = rv = (r1 + 256) >> 9 */ + "mov r1, r1, asr #9 \n\t" + "rsb r3, r3, #128 \n\t" /* r3 = guv = (-r3 + 128) >> 8 */ + "mov r3, r3, asr #8 \n\t" + "add r0, r0, #2 \n\t" /* r0 = bu = (Cb*128 + 256) >> 9 */ + "mov r0, r0, asr #2 \n\t" - /* Since out of bounds errors are relatively rare, we check two - pixels at once to see if any components are out of bounds, and - then fix whichever is broken. This works due to high values and - negative values both being !=0 when bitmasking them. - We first check for red and blue components (5bit range). */ - if ((red1 | blue1 | red2 | blue2) & ~MAX_5BIT) - { - if (red1 & ~MAX_5BIT) - red1 = (red1 >> 31) ? 0 : MAX_5BIT; - if (blue1 & ~MAX_5BIT) - blue1 = (blue1 >> 31) ? 0 : MAX_5BIT; - if (red2 & ~MAX_5BIT) - red2 = (red2 >> 31) ? 0 : MAX_5BIT; - if (blue2 & ~MAX_5BIT) - blue2 = (blue2 >> 31) ? 0 : MAX_5BIT; - } - /* We second check for green component (6bit range) */ - if ((green1 | green2) & ~MAX_6BIT) - { - if (green1 & ~MAX_6BIT) - green1 = (green1 >> 31) ? 0 : MAX_6BIT; - if (green2 & ~MAX_6BIT) - green2 = (green2 >> 31) ? 0 : MAX_6BIT; - } + /* 1st loop, first pixel */ + "sub r5, r5, #16 \n\t" /* r5 = (Y'-16) * 37 = Y/2, with Y = (Y'-16) * 74 */ + "add r2, r5, r5, asl #2 \n\t" + "add r5, r2, r5, asl #5 \n\t" - /* pixel1 */ - outw((red1 << 11) | (green1 << 5) | blue1, 0x30000000); + "add r6, r1, r5, asr #8 \n\t" /* r6 = r = (Y >> 9) + rv */ + "add r2, r3, r5, asr #7 \n\t" /* r2 = g = (Y >> 8) + guv */ + "add r4, r0, r5, asr #8 \n\t" /* r4 = b = (Y >> 9) + bu */ + + "orr r5, r6, r4 \n\t" /* check if clamping is needed... 
*/ + "orr r5, r5, r2, asr #1 \n\t" /* ...at all */ + "cmp r5, #31 \n\t" + "bls 15f \n\t" /* -> no clamp */ + "cmp r6, #31 \n\t" /* clamp r */ + "mvnhi r6, r6, asr #31 \n\t" + "andhi r6, r6, #31 \n\t" + "cmp r2, #63 \n\t" /* clamp g */ + "mvnhi r2, r2, asr #31 \n\t" + "andhi r2, r2, #63 \n\t" + "cmp r4, #31 \n\t" /* clamp b */ + "mvnhi r4, r4, asr #31 \n\t" + "andhi r4, r4, #31 \n\t" + "15: \n\t" /* no clamp */ + + "orr r4, r4, r2, lsl #5 \n\t" /* pixel = r<<11 | g<<5 | b */ + "orr r4, r4, r6, lsl #11 \n\t" + "strh r4, [r8] \n\t" /* write pixel */ + + /* 1st loop, second pixel */ + "ldrb r5, [%[ysrc]], #1 \n\t" /* r5 = *ysrc++ = *Y'_p++ */ + "sub r5, r5, #16 \n\t" /* r5 = (Y'-16) * 37 = Y/2, with Y = (Y'-16) * 74 */ + "add r2, r5, r5, asl #2 \n\t" + "add r5, r2, r5, asl #5 \n\t" + + "add r6, r1, r5, asr #8 \n\t" /* r6 = r = (Y >> 9) + rv */ + "add r2, r3, r5, asr #7 \n\t" /* r2 = g = (Y >> 8) + guv */ + "add r4, r0, r5, asr #8 \n\t" /* r4 = b = (Y >> 9) + bu */ + + "orr r5, r6, r4 \n\t" /* check if clamping is needed... 
*/ + "orr r5, r5, r2, asr #1 \n\t" /* ...at all */ + "cmp r5, #31 \n\t" + "bls 15f \n\t" /* -> no clamp */ + "cmp r6, #31 \n\t" /* clamp r */ + "mvnhi r6, r6, asr #31 \n\t" + "andhi r6, r6, #31 \n\t" + "cmp r2, #63 \n\t" /* clamp g */ + "mvnhi r2, r2, asr #31 \n\t" + "andhi r2, r2, #63 \n\t" + "cmp r4, #31 \n\t" /* clamp b */ + "mvnhi r4, r4, asr #31 \n\t" + "andhi r4, r4, #31 \n\t" + "15: \n\t" /* no clamp */ + + "orr r4, r4, r2, lsl #5 \n\t" /* pixel = r<<11 | g<<5 | b */ + "orr r4, r4, r6, lsl #11 \n\t" + "strh r4, [r8] \n\t" /* write pixel */ + + "subs r7, r7, #2 \n\t" /* check for loop end */ + "bgt 10b \n\t" /* back to beginning */ + /* 1st loop end */ + + /* set loop counter */ + "mov r7, %[cnt] \n\t" /* r7 = width */ + + /* set correct addresses for next loop */ + "sub %[usrc], %[usrc], %[cnt], asr #1 \n\t" /* usrc -= width/2 (rewind to reuse the chroma row) */ + "sub %[vsrc], %[vsrc], %[cnt], asr #1 \n\t" /* vsrc -= width/2 (rewind to reuse the chroma row) */ + "sub %[ysrc], %[ysrc], %[cnt] \n\t" /* ysrc += stride - width (start of the next luma row) */ + "add %[ysrc], %[ysrc], %[stride] \n\t" + + /* 2nd loop start: converts the second luma row with the same chroma row */ + "20: \n\t" /* loop start */ + + "ldrb r0, [%[usrc]], #1 \n\t" /* r0 = *usrc++ = *Cb_p++ */ + "ldrb r1, [%[vsrc]], #1 \n\t" /* r1 = *vsrc++ = *Cr_p++ */ + "ldrb r5, [%[ysrc]], #1 \n\t" /* r5 = *ysrc++ = *Y'_p++ */ - /* pixel2 */ - outw((red2 << 11) | (green2 << 5) | blue2, 0x30000000); - } - while (ysrc < row_end); + "sub r0, r0, #128 \n\t" /* r0 = Cb-128 */ + "sub r1, r1, #128 \n\t" /* r1 = Cr-128 */ - src_y++; - } + "add r3, r1, r1, asl #1 \n\t" /* r3 = Cr*51 + Cb*24 */ + "add r3, r3, r3, asl #4 \n\t" + "add r3, r3, r0, asl #3 \n\t" + "add r3, r3, r0, asl #4 \n\t" + "add r4, r1, r1, asl #2 \n\t" /* r1 = Cr*101 */ + "add r4, r4, r1, asl #5 \n\t" + "add r1, r4, r1, asl #6 \n\t" + + "add r1, r1, #256 \n\t" /* r1 = rv = (r1 + 256) >> 9 */ + "mov r1, r1, asr #9 \n\t" + "rsb r3, r3, #128 \n\t" /* r3 = guv = (-r3 + 128) >> 8 */ + "mov r3, r3, asr #8 \n\t" + "add r0, r0, #2 \n\t" /* r0 = bu = (Cb*128 + 256) >> 9 */ + "mov r0, r0, asr #2 
\n\t" + + /* 2nd loop, first pixel */ + "sub r5, r5, #16 \n\t" /* r5 = (Y'-16) * 37 = Y/2, with Y = (Y'-16) * 74 */ + "add r2, r5, r5, asl #2 \n\t" + "add r5, r2, r5, asl #5 \n\t" + + "add r6, r1, r5, asr #8 \n\t" /* r6 = r = (Y >> 9) + rv */ + "add r2, r3, r5, asr #7 \n\t" /* r2 = g = (Y >> 8) + guv */ + "add r4, r0, r5, asr #8 \n\t" /* r4 = b = (Y >> 9) + bu */ + + "orr r5, r6, r4 \n\t" /* check if clamping is needed... */ + "orr r5, r5, r2, asr #1 \n\t" /* ...at all */ + "cmp r5, #31 \n\t" + "bls 15f \n\t" /* -> no clamp */ + "cmp r6, #31 \n\t" /* clamp r */ + "mvnhi r6, r6, asr #31 \n\t" + "andhi r6, r6, #31 \n\t" + "cmp r2, #63 \n\t" /* clamp g */ + "mvnhi r2, r2, asr #31 \n\t" + "andhi r2, r2, #63 \n\t" + "cmp r4, #31 \n\t" /* clamp b */ + "mvnhi r4, r4, asr #31 \n\t" + "andhi r4, r4, #31 \n\t" + "15: \n\t" /* no clamp */ + + "orr r4, r4, r2, lsl #5 \n\t" /* pixel = r<<11 | g<<5 | b */ + "orr r4, r4, r6, lsl #11 \n\t" + "strh r4, [r8] \n\t" /* write pixel */ + + /* 2nd loop, second pixel */ + "ldrb r5, [%[ysrc]], #1 \n\t" /* r5 = *ysrc++ = *Y'_p++ */ + "sub r5, r5, #16 \n\t" /* r5 = (Y'-16) * 37 = Y/2, with Y = (Y'-16) * 74 */ + "add r2, r5, r5, asl #2 \n\t" + "add r5, r2, r5, asl #5 \n\t" + + "add r6, r1, r5, asr #8 \n\t" /* r6 = r = (Y >> 9) + rv */ + "add r2, r3, r5, asr #7 \n\t" /* r2 = g = (Y >> 8) + guv */ + "add r4, r0, r5, asr #8 \n\t" /* r4 = b = (Y >> 9) + bu */ + + "orr r5, r6, r4 \n\t" /* check if clamping is needed... 
*/ + "orr r5, r5, r2, asr #1 \n\t" /* ...at all */ + "cmp r5, #31 \n\t" + "bls 15f \n\t" /* -> no clamp */ + "cmp r6, #31 \n\t" /* clamp r */ + "mvnhi r6, r6, asr #31 \n\t" + "andhi r6, r6, #31 \n\t" + "cmp r2, #63 \n\t" /* clamp g */ + "mvnhi r2, r2, asr #31 \n\t" + "andhi r2, r2, #63 \n\t" + "cmp r4, #31 \n\t" /* clamp b */ + "mvnhi r4, r4, asr #31 \n\t" + "andhi r4, r4, #31 \n\t" + "15: \n\t" /* no clamp */ + + "orr r4, r4, r2, lsl #5 \n\t" /* pixel = r<<11 | g<<5 | b */ + "orr r4, r4, r6, lsl #11 \n\t" + "strh r4, [r8] \n\t" /* write pixel */ + + "subs r7, r7, #2 \n\t" /* check for loop end */ + "bgt 20b \n\t" /* back to beginning */ + /* 2nd loop end. NOTE(review): the asm changes the condition flags (cmp/subs) and loads through the src pointers, yet the clobber list below names neither cc nor memory -- confirm this is safe */ + + : [usrc]"+r"(usrc), [vsrc]"+r"(vsrc), [ysrc]"+r"(ysrc) + : [cnt]"r"(width), [stride]"r"(stride) + : "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7", "r8" + ); + + ysrc += (stride ) - width; + usrc += (stride>>1) - (width>>1); + vsrc += (stride>>1) - (width>>1); + + } while (--height>0); + /* Top-half of original lcd_bcm_finishup() function */ outw(0x31, 0x30030000);