Index: firmware/target/arm/ipod/video/lcd-video.c
===================================================================
--- firmware/target/arm/ipod/video/lcd-video.c	(revision 15247)
+++ firmware/target/arm/ipod/video/lcd-video.c	(working copy)
@@ -207,34 +207,6 @@
     lcd_update_rect(0, 0, LCD_WIDTH, LCD_HEIGHT);
 }
 
-/* YUV- > RGB565 conversion
- * |R|   |1.000000 -0.000001  1.402000| |Y'|
- * |G| = |1.000000 -0.334136 -0.714136| |Pb|
- * |B|   |1.000000  1.772000  0.000000| |Pr|
- * Scaled, normalized, rounded and tweaked to yield RGB 565:
- * |R|   |74   0 101| |Y' - 16| >> 9
- * |G| = |74 -24 -51| |Cb - 128| >> 8
- * |B|   |74 128   0| |Cr - 128| >> 9
-*/
-
-#define RGBYFAC 74 /* 1.0 */
-#define RVFAC 101 /* 1.402 */
-#define GVFAC (-51) /* -0.714136 */
-#define GUFAC (-24) /* -0.334136 */
-#define BUFAC 128 /* 1.772 */
-
-/* ROUNDOFFS contain constant for correct round-offs as well as
-   constant parts of the conversion matrix (e.g. (Y'-16)*RGBYFAC
-   -> constant part = -16*RGBYFAC). Through extraction of these
-   constant parts we save at leat 4 substractions in the conversion
-   loop */
-#define ROUNDOFFSR (256 - 16*RGBYFAC - 128*RVFAC)
-#define ROUNDOFFSG (128 - 16*RGBYFAC - 128*GVFAC - 128*GUFAC)
-#define ROUNDOFFSB (256 - 16*RGBYFAC - 128*BUFAC)
-
-#define MAX_5BIT 0x1f
-#define MAX_6BIT 0x3f
-
 /* Performance function to blit a YUV bitmap directly to the LCD */
 void lcd_yuv_blit(unsigned char * const src[3],
                   int src_x, int src_y, int stride,
@@ -243,6 +215,16 @@
                   int src_x, int src_y, int stride,
                   int x, int y, int width, int height)
 {
+    static unsigned char chroma_buf[LCD_WIDTH/2*3]; /* 480 bytes for iPod Video */
+    /* Chroma rows are stride/2 bytes long and each is shared by two luma
+       rows, so halve stride and src_y separately: (stride*src_y)>>2 would
+       point into the middle of a chroma row whenever src_y is odd. */
+    const int uvoffset = (stride >> 1) * (src_y >> 1) + (src_x >> 1);
+    unsigned char const *ysrc = src[0] + stride * src_y + src_x;
+    unsigned char const *usrc = src[1] + uvoffset;
+    unsigned char const *vsrc = src[2] + uvoffset;
+
+    /* width must be divisible by 2 */
     width = (width + 1) & ~1;
 
     if (finishup_needed)
@@ -279,83 +261,217 @@
     /* wait for it to be write ready */
     while ((inw(0x30030000) & 0x2) == 0);
 
-    const int ymax = y + height - 1;
-    const int stride_div_sub_x = stride >> 1;
-    unsigned char *ysrc = 0;
-    unsigned char *usrc = 0;
-    unsigned char *vsrc = 0;
-    unsigned char *row_end = 0;
-    int uvoffset;
-    int yp, up, vp, rc, gc, bc; /* temporary variables */
-    int red1, green1, blue1; /* contain RGB of 1st pixel */
-    int red2, green2, blue2; /* contain RGB of 2nd pixel */
-
-    for (; y <= ymax ; y++)
+    /* NOTE(review): height >> 1 drops an odd last line, and the do/while
+       below still runs once (drawing two lines) when height < 2 -- confirm
+       that callers only pass even heights >= 2. */
+    height >>= 1; /* we do 2 lines in one turn */
+    do
     {
-        /* upsampling, YUV->RGB conversion and reduction to RGB565 in one go */
-        uvoffset = stride_div_sub_x*(src_y >> 1) + (src_x >> 1);
-        ysrc = src[0] + stride * src_y + src_x;
-        usrc = src[1] + uvoffset;
-        vsrc = src[2] + uvoffset;
-
-        row_end = ysrc + width;
-
-        do
-        {
-            up = *usrc++;
-            vp = *vsrc++;
-            rc = RVFAC * vp + ROUNDOFFSR;
-            gc = GVFAC * vp + GUFAC * up + ROUNDOFFSG;
-            bc = BUFAC * up + ROUNDOFFSB;
+        /* The asm post-increments its pointer operands, so they are passed
+           as read-write operands; the chroma scratch pointer is re-armed
+           for every pair of lines. */
+        unsigned char *chroma_ptr = chroma_buf;
+
+        /* YUV -> RGB565 conversion
+         * |R|   |1.000000 -0.000001  1.402000| |Y'|
+         * |G| = |1.000000 -0.334136 -0.714136| |Pb|
+         * |B|   |1.000000  1.772000  0.000000| |Pr|
+         * Scaled, normalized, rounded and tweaked to yield RGB 565:
+         * |R|   |74   0 101| |Y' - 16| >> 9
+         * |G| = |74 -24 -51| |Cb - 128| >> 8
+         * |B|   |74 128   0| |Cr - 128| >> 9
+         */
+        asm volatile (
+            /* set loop counter */
+            "mov     r7, %[cnt]                  \n\t" /* r7 = width */
 
-            /* Pixel 1 -> RGB565 */
-            yp = *ysrc++ * RGBYFAC;
-            red1 = (yp + rc) >> 9;
-            green1 = (yp + gc) >> 8;
-            blue1 = (yp + bc) >> 9;
+            /* 1st loop start */
+            "10:                                 \n\t" /* loop start */
 
-            /* Pixel 2 -> RGB565 */
-            yp = *ysrc++ * RGBYFAC;
-            red2 = (yp + rc) >> 9;
-            green2 = (yp + gc) >> 8;
-            blue2 = (yp + bc) >> 9;
+            "ldrb    r0, [%[usrc]], #1           \n\t" /* r0 = *usrc++ = *Cb_p++ */
+            "ldrb    r1, [%[vsrc]], #1           \n\t" /* r1 = *vsrc++ = *Cr_p++ */
+            "ldrb    r5, [%[ysrc]], #1           \n\t" /* r5 = *ysrc++ = *Y'_p++ */
+
+            "sub     r0, r0, #128                \n\t" /* r0 = Cb-128 */
+            "sub     r1, r1, #128                \n\t" /* r1 = Cr-128 */
+
+            "add     r3, r1, r1, asl #1          \n\t" /* r3 = Cr*51 + Cb*24 */
+            "add     r3, r3, r3, asl #4          \n\t"
+            "add     r3, r3, r0, asl #3          \n\t"
+            "add     r3, r3, r0, asl #4          \n\t"
+
+            "add     r4, r1, r1, asl #2          \n\t" /* r1 = Cr*101 */
+            "add     r4, r4, r1, asl #5          \n\t"
+            "add     r1, r4, r1, asl #6          \n\t"
+
+            "add     r1, r1, #256                \n\t" /* r1 = rv = (r1 + 256) >> 9 */
+            "mov     r1, r1, asr #9              \n\t"
+            "strb    r1, [%[buf]], #1            \n\t" /* store r1 to chroma_buf */
+            "rsb     r3, r3, #128                \n\t" /* r3 = guv = (-r3 + 128) >> 8 */
+            "mov     r3, r3, asr #8              \n\t"
+            "strb    r3, [%[buf]], #1            \n\t" /* store r3 to chroma_buf */
+            "add     r0, r0, #2                  \n\t" /* r0 = bu = (Cb*128 + 256) >> 9 */
+            "mov     r0, r0, asr #2              \n\t"
+            "strb    r0, [%[buf]], #1            \n\t" /* store r0 to chroma_buf */
 
-            /* Since out of bounds errors are relatively rare, we check two
-               pixels at once to see if any components are out of bounds, and
-               then fix whichever is broken. This works due to high values and
-               negative values both being !=0 when bitmasking them.
-               We first check for red and blue components (5bit range). */
-            if ((red1 | blue1 | red2 | blue2) & ~MAX_5BIT)
-            {
-                if (red1 & ~MAX_5BIT)
-                    red1 = (red1 >> 31) ? 0 : MAX_5BIT;
-                if (blue1 & ~MAX_5BIT)
-                    blue1 = (blue1 >> 31) ? 0 : MAX_5BIT;
-                if (red2 & ~MAX_5BIT)
-                    red2 = (red2 >> 31) ? 0 : MAX_5BIT;
-                if (blue2 & ~MAX_5BIT)
-                    blue2 = (blue2 >> 31) ? 0 : MAX_5BIT;
-            }
-            /* We second check for green component (6bit range) */
-            if ((green1 | green2) & ~MAX_6BIT)
-            {
-                if (green1 & ~MAX_6BIT)
-                    green1 = (green1 >> 31) ? 0 : MAX_6BIT;
-                if (green2 & ~MAX_6BIT)
-                    green2 = (green2 >> 31) ? 0 : MAX_6BIT;
-            }
+            /* 1st loop, first pixel */
+            "sub     r5, r5, #16                 \n\t" /* r5 = (Y'-16) * 37 (74/2) */
+            "add     r2, r5, r5, asl #2          \n\t"
+            "add     r5, r2, r5, asl #5          \n\t"
 
-            /* pixel1 */
-            outw((red1 << 11) | (green1 << 5) | blue1, 0x30000000);
+            "add     r6, r1, r5, asr #8          \n\t" /* r6 = r = (Y >> 9) + rv */
+            "add     r2, r3, r5, asr #7          \n\t" /* r2 = g = (Y >> 8) + guv */
+            "add     r4, r0, r5, asr #8          \n\t" /* r4 = b = (Y >> 9) + bu */
+
+            "orr     r5, r6, r4                  \n\t" /* check if clamping is needed... */
+            "orr     r5, r5, r2, asr #1          \n\t" /* ...at all */
+            "cmp     r5, #31                     \n\t"
+            "bls     15f                         \n\t" /* -> no clamp */
+            "cmp     r6, #31                     \n\t" /* clamp r */
+            "mvnhi   r6, r6, asr #31             \n\t"
+            "andhi   r6, r6, #31                 \n\t"
+            "cmp     r2, #63                     \n\t" /* clamp g */
+            "mvnhi   r2, r2, asr #31             \n\t"
+            "andhi   r2, r2, #63                 \n\t"
+            "cmp     r4, #31                     \n\t" /* clamp b */
+            "mvnhi   r4, r4, asr #31             \n\t"
+            "andhi   r4, r4, #31                 \n\t"
+            "15:                                 \n\t" /* no clamp */
+
+            "orr     r4, r4, r2, lsl #5          \n\t" /* pixel = r<<11 | g<<5 | b */
+            "orr     r4, r4, r6, lsl #11         \n\t"
+            "mov     r6, #0x30000000             \n\t"
+            "strh    r4, [r6]                    \n\t" /* write pixel */
+
+            /* 1st loop, second pixel */
+            "ldrb    r5, [%[ysrc]], #1           \n\t" /* r5 = *ysrc++ = *Y'_p++ */
+            "sub     r5, r5, #16                 \n\t" /* r5 = (Y'-16) * 37 (74/2) */
+            "add     r2, r5, r5, asl #2          \n\t"
+            "add     r5, r2, r5, asl #5          \n\t"
+
+            "add     r6, r1, r5, asr #8          \n\t" /* r6 = r = (Y >> 9) + rv */
+            "add     r2, r3, r5, asr #7          \n\t" /* r2 = g = (Y >> 8) + guv */
+            "add     r4, r0, r5, asr #8          \n\t" /* r4 = b = (Y >> 9) + bu */
+
+            "orr     r5, r6, r4                  \n\t" /* check if clamping is needed... */
+            "orr     r5, r5, r2, asr #1          \n\t" /* ...at all */
+            "cmp     r5, #31                     \n\t"
+            "bls     15f                         \n\t" /* -> no clamp */
+            "cmp     r6, #31                     \n\t" /* clamp r */
+            "mvnhi   r6, r6, asr #31             \n\t"
+            "andhi   r6, r6, #31                 \n\t"
+            "cmp     r2, #63                     \n\t" /* clamp g */
+            "mvnhi   r2, r2, asr #31             \n\t"
+            "andhi   r2, r2, #63                 \n\t"
+            "cmp     r4, #31                     \n\t" /* clamp b */
+            "mvnhi   r4, r4, asr #31             \n\t"
+            "andhi   r4, r4, #31                 \n\t"
+            "15:                                 \n\t" /* no clamp */
+
+            "orr     r4, r4, r2, lsl #5          \n\t" /* pixel = r<<11 | g<<5 | b */
+            "orr     r4, r4, r6, lsl #11         \n\t"
+            "mov     r6, #0x30000000             \n\t"
+            "strh    r4, [r6]                    \n\t" /* write pixel */
+
+            "subs    r7, r7, #2                  \n\t" /* check for loop end */
+            "bgt     10b                         \n\t" /* back to beginning */
+            /* 1st loop end */
+
+            /* set loop counter */
+            "mov     r7, %[cnt]                  \n\t" /* r7 = width */
+
+            /* set correct addresses for next loop */
+            "sub     %[buf] , %[buf] , %[cnt], asl #1 \n\t" /* *buf -= width/2 * 3 */
+            "add     %[buf] , %[buf] , %[cnt], asr #1 \n\t"
+            "sub     %[ysrc], %[ysrc], %[cnt]    \n\t" /* *ysrc += stride - width */
+            "add     %[ysrc], %[ysrc], %[stride] \n\t"
+
+            /* 2nd loop start */
+            "20:                                 \n\t" /* loop start */
+
+            /* restore r1, r3 and r0 from chroma buffer */
+            "ldrsb   r1, [%[buf]], #1            \n\t"
+            "ldrsb   r3, [%[buf]], #1            \n\t"
+            "ldrsb   r0, [%[buf]], #1            \n\t"
+
+            /* 2nd loop, first pixel */
+            "ldrb    r5, [%[ysrc]], #1           \n\t" /* r5 = *ysrc++ = *Y'_p++ */
+            "sub     r5, r5, #16                 \n\t" /* r5 = (Y'-16) * 37 (74/2) */
+            "add     r2, r5, r5, asl #2          \n\t"
+            "add     r5, r2, r5, asl #5          \n\t"
+
+            "add     r6, r1, r5, asr #8          \n\t" /* r6 = r = (Y >> 9) + rv */
+            "add     r2, r3, r5, asr #7          \n\t" /* r2 = g = (Y >> 8) + guv */
+            "add     r4, r0, r5, asr #8          \n\t" /* r4 = b = (Y >> 9) + bu */
+
+            "orr     r5, r6, r4                  \n\t" /* check if clamping is needed... */
+            "orr     r5, r5, r2, asr #1          \n\t" /* ...at all */
+            "cmp     r5, #31                     \n\t"
+            "bls     15f                         \n\t" /* -> no clamp */
+            "cmp     r6, #31                     \n\t" /* clamp r */
+            "mvnhi   r6, r6, asr #31             \n\t"
+            "andhi   r6, r6, #31                 \n\t"
+            "cmp     r2, #63                     \n\t" /* clamp g */
+            "mvnhi   r2, r2, asr #31             \n\t"
+            "andhi   r2, r2, #63                 \n\t"
+            "cmp     r4, #31                     \n\t" /* clamp b */
+            "mvnhi   r4, r4, asr #31             \n\t"
+            "andhi   r4, r4, #31                 \n\t"
+            "15:                                 \n\t" /* no clamp */
+
+            "orr     r4, r4, r2, lsl #5          \n\t" /* pixel = r<<11 | g<<5 | b */
+            "orr     r4, r4, r6, lsl #11         \n\t"
+            "mov     r6, #0x30000000             \n\t"
+            "strh    r4, [r6]                    \n\t" /* write pixel */
+
+            /* 2nd loop, second pixel */
+            "ldrb    r5, [%[ysrc]], #1           \n\t" /* r5 = *ysrc++ = *Y'_p++ */
+            "sub     r5, r5, #16                 \n\t" /* r5 = (Y'-16) * 37 (74/2) */
+            "add     r2, r5, r5, asl #2          \n\t"
+            "add     r5, r2, r5, asl #5          \n\t"
+
+            "add     r6, r1, r5, asr #8          \n\t" /* r6 = r = (Y >> 9) + rv */
+            "add     r2, r3, r5, asr #7          \n\t" /* r2 = g = (Y >> 8) + guv */
+            "add     r4, r0, r5, asr #8          \n\t" /* r4 = b = (Y >> 9) + bu */
+
+            "orr     r5, r6, r4                  \n\t" /* check if clamping is needed... */
+            "orr     r5, r5, r2, asr #1          \n\t" /* ...at all */
+            "cmp     r5, #31                     \n\t"
+            "bls     15f                         \n\t" /* -> no clamp */
+            "cmp     r6, #31                     \n\t" /* clamp r */
+            "mvnhi   r6, r6, asr #31             \n\t"
+            "andhi   r6, r6, #31                 \n\t"
+            "cmp     r2, #63                     \n\t" /* clamp g */
+            "mvnhi   r2, r2, asr #31             \n\t"
+            "andhi   r2, r2, #63                 \n\t"
+            "cmp     r4, #31                     \n\t" /* clamp b */
+            "mvnhi   r4, r4, asr #31             \n\t"
+            "andhi   r4, r4, #31                 \n\t"
+            "15:                                 \n\t" /* no clamp */
+
+            "orr     r4, r4, r2, lsl #5          \n\t" /* pixel = r<<11 | g<<5 | b */
+            "orr     r4, r4, r6, lsl #11         \n\t"
+            "mov     r6, #0x30000000             \n\t"
+            "strh    r4, [r6]                    \n\t" /* write pixel */
+
+            "subs    r7, r7, #2                  \n\t" /* check for loop end */
+            "bgt     20b                         \n\t" /* back to beginning */
+            /* 2nd loop end */
+
+            : [usrc]"+&r"(usrc), [vsrc]"+&r"(vsrc), [ysrc]"+&r"(ysrc),
+              [buf]"+&r"(chroma_ptr)
+            : [cnt]"r"(width), [stride]"r"(stride)
+            : "cc", "memory", "r0", "r1", "r2", "r3", "r4", "r5", "r6", "r7"
+        );
+
+        /* The asm left ysrc at the end of the second line and usrc/vsrc at
+           the end of the chroma row; step all three to the next line pair. */
+        ysrc += (stride   ) -  width;
+        usrc += (stride>>1) - (width>>1);
+        vsrc += (stride>>1) - (width>>1);
+
+    } while (--height>0);
 
-            /* pixel2 */
-            outw((red2 << 11) | (green2 << 5) | blue2, 0x30000000);
-        }
-        while (ysrc < row_end);
-
-        src_y++;
-    }
-
     /* Top-half of original lcd_bcm_finishup() function */
     outw(0x31, 0x30030000);
 