From f230095c26852259245aeae9c397c0cc59afe665 Mon Sep 17 00:00:00 2001
From: Sean Bartell
Date: Mon, 22 Aug 2011 14:21:10 -0400
Subject: [PATCH 3/7] rbcodec refactoring: unicode

---
 apps/rbcodecconfig.h               |    1 +
 apps/rbcodecplatform.h             |   39 ++++++++++++++++++++++++
 lib/rbcodec/SOURCES                |    3 ++
 lib/rbcodec/metadata/id3tags.c     |   23 ++++++--------
 lib/rbcodec/metadata/sid.c         |    6 ++--
 lib/rbcodec/metadata/smaf.c        |   57 ++++++++++++++++--------------------
 lib/rbcodec/metadata/spc.c         |    8 ++--
 lib/rbcodec/metadata/vgm.c         |    4 +-
 lib/rbcodec/metadata/wave.c        |   17 ++---------
 lib/rbcodec/platform.h             |    2 +
 lib/rbcodec/rbcodecplatform-unix.h |   36 ++++++++++++++++++++++
 11 files changed, 128 insertions(+), 68 deletions(-)

diff --git a/apps/rbcodecconfig.h b/apps/rbcodecconfig.h
index 7e1225f..0c54ad7 100644
--- a/apps/rbcodecconfig.h
+++ b/apps/rbcodecconfig.h
@@ -19,5 +19,6 @@
 #endif
 
 #define HAVE_ROCKBOX_FIXEDPOINT
+#define HAVE_ROCKBOX_UNICODE
 
 #endif
diff --git a/apps/rbcodecplatform.h b/apps/rbcodecplatform.h
index 21bf365..757380a 100644
--- a/apps/rbcodecplatform.h
+++ b/apps/rbcodecplatform.h
@@ -20,4 +20,43 @@
 /* logf */
 #include "logf.h"
 
+/* Character encoding conversion.
+ *
+ * encoding_t is Rockbox's enum codepages. ENCODING_DEFAULT, ENCODING_NONE,
+ * ENCODING_UTF_16BE and ENCODING_UTF_16LE are defined below as -1,
+ * NUM_CODEPAGES, NUM_CODEPAGES+1 and NUM_CODEPAGES+2 respectively.
+ */
+#include "rbunicode.h"
+typedef enum codepages encoding_t;
+#define ENCODING_DEFAULT (-1)
+#define ENCODING_ISO_8859_1 ISO_8859_1
+#define ENCODING_SJIS SJIS
+#define ENCODING_EUC_CN GB_2312
+#define ENCODING_BIG_5 BIG_5
+#define ENCODING_UTF_8 UTF_8
+#define ENCODING_NONE NUM_CODEPAGES
+#define ENCODING_UTF_16BE (NUM_CODEPAGES+1)
+#define ENCODING_UTF_16LE (NUM_CODEPAGES+2)
+
+/* Convert in_len units of input at `in` from `encoding` to UTF-8 at `out`.
+ *
+ * ENCODING_UTF_16BE and ENCODING_UTF_16LE go to utf16BEdecode() and
+ * utf16LEdecode(); any other value is handed to iso_decode() as its codepage
+ * argument. The selected decoder's return value is returned unchanged;
+ * callers in rbcodec write the terminating NUL through it.
+ *
+ * NOTE(review): in_len is forwarded as-is to all three decoders -- confirm
+ * they agree on its unit (bytes vs. UTF-16 code units).
+ */
+static inline char *decode_text(encoding_t encoding, const char *in, char *out,
+                                size_t in_len)
+{
+    if (encoding == ENCODING_UTF_16BE)
+        return utf16BEdecode(in, out, in_len);
+    else if (encoding == ENCODING_UTF_16LE)
+        return utf16LEdecode(in, out, in_len);
+    else
+        return iso_decode(in, out, encoding, in_len);
+}
+
 #endif
diff --git a/lib/rbcodec/SOURCES b/lib/rbcodec/SOURCES
index 641c868..52957a3 100644
--- a/lib/rbcodec/SOURCES
+++ 
b/lib/rbcodec/SOURCES @@ -2,6 +2,9 @@ metadata/metadata.c metadata/id3tags.c metadata/mp3.c metadata/mp3data.c +#ifndef HAVE_ROCKBOX_UNICODE +util/unicode.c +#endif #ifndef HAVE_ROCKBOX_FIXEDPOINT util/fixedpoint.c #endif diff --git a/lib/rbcodec/metadata/id3tags.c b/lib/rbcodec/metadata/id3tags.c index cd05e54..3aeb13e 100644 --- a/lib/rbcodec/metadata/id3tags.c +++ b/lib/rbcodec/metadata/id3tags.c @@ -547,7 +547,7 @@ static int unicode_munge(char* string, char* utf8buf, int *len) { case 0x00: /* Type 0x00 is ordinary ISO 8859-1 */ str++; (*len)--; - utf8 = iso_decode(str, utf8, -1, *len); + utf8 = decode_text(ENCODING_DEFAULT, str, utf8, *len); *utf8 = 0; *len = (unsigned long)utf8 - (unsigned long)utf8buf; break; @@ -580,19 +580,16 @@ static int unicode_munge(char* string, char* utf8buf, int *len) { le = true; do { - if(le) - utf8 = utf16LEdecode(str, utf8, 1); - else - utf8 = utf16BEdecode(str, utf8, 1); - - str+=2; i += 2; - } while((str[0] || str[1]) && (i < *len)); - + } while((str[i] || str[i+1]) && (i < *len)); + if(le) + utf8 = decode_text(ENCODING_UTF_16LE, str, utf8, i); + else + utf8 = decode_text(ENCODING_UTF_16BE, str, utf8, i); *utf8++ = 0; /* Terminate the string */ templen += (strlen(&utf8buf[templen]) + 1); - str += 2; i+=2; + str += i; } while(i < *len); *len = templen - 1; break; @@ -604,7 +601,7 @@ static int unicode_munge(char* string, char* utf8buf, int *len) { break; default: /* Plain old string */ - utf8 = iso_decode(str, utf8, -1, *len); + utf8 = decode_text(ENCODING_DEFAULT, str, utf8, *len); *utf8 = 0; *len = (unsigned long)utf8 - (unsigned long)utf8buf; break; @@ -651,7 +648,7 @@ bool setid3v1title(int fd, struct mp3entry *entry) ptr[j] = 0; /* convert string to utf8 */ utf8 = (unsigned char *)entry->id3v1buf[i]; - utf8 = iso_decode(ptr, utf8, -1, 30); + utf8 = decode_text(ENCODING_DEFAULT, ptr, utf8, 30); /* make sure string is terminated */ *utf8 = 0; break; @@ -662,7 +659,7 @@ bool setid3v1title(int fd, struct mp3entry *entry) 
ptr[j] = 0; /* convert string to utf8 */ utf8 = (unsigned char *)entry->id3v1buf[3]; - utf8 = iso_decode(ptr, utf8, -1, 28); + utf8 = decode_text(ENCODING_DEFAULT, ptr, utf8, 28); /* make sure string is terminated */ *utf8 = 0; break; diff --git a/lib/rbcodec/metadata/sid.c b/lib/rbcodec/metadata/sid.c index 3a276b3..3a52eb8 100644 --- a/lib/rbcodec/metadata/sid.c +++ b/lib/rbcodec/metadata/sid.c @@ -55,12 +55,12 @@ bool get_sid_metadata(int fd, struct mp3entry* id3) /* Copy Title (assumed max 0x1f letters + 1 zero byte) */ id3->title = p; buf[0x16+0x1f] = 0; - p = iso_decode(&buf[0x16], p, 0, strlen(&buf[0x16])+1); + p = decode_text(ENCODING_ISO_8859_1, &buf[0x16], p, 0x20); /* Copy Artist (assumed max 0x1f letters + 1 zero byte) */ id3->artist = p; buf[0x36+0x1f] = 0; - p = iso_decode(&buf[0x36], p, 0, strlen(&buf[0x36])+1); + p = decode_text(ENCODING_ISO_8859_1, &buf[0x36], p, 0x20); /* Copy Year (assumed max 4 letters + 1 zero byte) */ buf[0x56+0x4] = 0; @@ -69,7 +69,7 @@ bool get_sid_metadata(int fd, struct mp3entry* id3) /* Copy Album (assumed max 0x1f-0x05 letters + 1 zero byte) */ id3->album = p; buf[0x56+0x1f] = 0; - iso_decode(&buf[0x5b], p, 0, strlen(&buf[0x5b])+1); + decode_text(ENCODING_ISO_8859_1, &buf[0x5b], p, 0x20); id3->bitrate = 706; id3->frequency = 44100; diff --git a/lib/rbcodec/metadata/smaf.c b/lib/rbcodec/metadata/smaf.c index aeeadb4..2937929 100644 --- a/lib/rbcodec/metadata/smaf.c +++ b/lib/rbcodec/metadata/smaf.c @@ -33,17 +33,14 @@ static const int basebits[4] = { 4, 8, 12, 16 }; static const int frequency[5] = { 4000, 8000, 11025, 22050, 44100 }; -static const int support_codepages[5] = { +static const encoding_t support_codepages[5] = { #ifdef HAVE_LCD_BITMAP - SJIS, ISO_8859_1, -1, GB_2312, BIG_5, + ENCODING_SJIS, ENCODING_ISO_8859_1, ENCODING_NONE, ENCODING_EUC_CN, ENCODING_BIG_5, #else - -1, ISO_8859_1, -1, -1, -1, + ENCODING_NONE, ENCODING_ISO_8859_1, ENCODING_NONE, ENCODING_NONE, ENCODING_NONE, #endif }; -/* extra codepage */ 
-#define UCS2 (NUM_CODEPAGES + 1) - /* support id3 tag */ #define TAG_TITLE (('S'<<8)|'T') #define TAG_ARTIST (('A'<<8)|'N') @@ -67,17 +64,17 @@ static inline int convert_smaf_audio_frequency(unsigned int freq) return frequency[freq]; } -static int convert_smaf_codetype(unsigned int codetype) +static encoding_t convert_smaf_codetype(unsigned int codetype) { if (codetype < 5) return support_codepages[codetype]; else if (codetype == 0x20 || codetype == 0x24) /* In Rockbox, UCS2 and UTF-16 are same. */ - return UCS2; + return ENCODING_UTF_16BE; else if (codetype == 0x23) - return UTF_8; + return ENCODING_UTF_8; else if (codetype == 0xff) - return ISO_8859_1; - return -1; + return ENCODING_ISO_8859_1; + return ENCODING_NONE; } static void set_length(struct mp3entry *id3, unsigned int ch, unsigned int basebit, @@ -111,26 +108,22 @@ static void set_length(struct mp3entry *id3, unsigned int ch, unsigned int baseb /* contents parse functions */ /* Note: - * 1) When the codepage is UTF-8 or UCS2, contents data do not start BOM. + * 1) When the codepage is UTF-8 or UTF-16, contents data do not start BOM. * 2) The byte order of contents data is big endian. 
*/ static void decode2utf8(const unsigned char *src, unsigned char **dst, - int srcsize, int *dstsize, int codepage) + int srcsize, int *dstsize, encoding_t codepage) { unsigned char tmpbuf[srcsize * 3 + 1]; unsigned char *p; int utf8size; - if (codepage < NUM_CODEPAGES) - p = iso_decode(src, tmpbuf, codepage, srcsize); - else /* codepage == UCS2 */ - p = utf16BEdecode(src, tmpbuf, srcsize); - - *p = '\0'; + p = decode_text(codepage, src, tmpbuf, srcsize); + *p++ = '\0'; strlcpy(*dst, tmpbuf, *dstsize); - utf8size = (p - tmpbuf) + 1; + utf8size = p - tmpbuf; if (utf8size > *dstsize) { DEBUGF("metadata warning: data length: %d > contents store buffer size: %d\n", @@ -141,8 +134,8 @@ static void decode2utf8(const unsigned char *src, unsigned char **dst, *dstsize -= utf8size; } -static int read_audio_track_contets(int fd, int codepage, unsigned char **dst, - int *dstsize) +static int read_audio_track_contets(int fd, encoding_t codepage, + unsigned char **dst, int *dstsize) { /* value length <= 256 bytes */ unsigned char buf[256]; @@ -155,7 +148,7 @@ static int read_audio_track_contets(int fd, int codepage, unsigned char **dst, while (p - buf < 256 && *p != ',') { /* skip yen mark */ - if (codepage != UCS2) + if (codepage != ENCODING_UTF_16BE) { if (*p == '\\') p++; @@ -165,13 +158,13 @@ static int read_audio_track_contets(int fd, int codepage, unsigned char **dst, if (*p > 0x7f) { - if (codepage == UTF_8) + if (codepage == ENCODING_UTF_8) { - while ((*p & MASK) != COMP) + while ((*p & 0xc0) != 0x80) *q++ = *p++; } #ifdef HAVE_LCD_BITMAP - else if (codepage == SJIS) + else if (codepage == ENCODING_SJIS) { if (*p <= 0xa0 || *p >= 0xe0) *q++ = *p++; @@ -180,7 +173,7 @@ static int read_audio_track_contets(int fd, int codepage, unsigned char **dst, } *q++ = *p++; - if (codepage == UCS2) + if (codepage == ENCODING_UTF_16BE) *q++ = *p++; } datasize = p - buf + 1; @@ -192,7 +185,7 @@ static int read_audio_track_contets(int fd, int codepage, unsigned char **dst, return 
datasize; } -static void read_score_track_contets(int fd, int codepage, int datasize, +static void read_score_track_contets(int fd, encoding_t codepage, int datasize, unsigned char **dst, int *dstsize) { unsigned char buf[datasize]; @@ -231,12 +224,12 @@ static bool parse_smaf_audio_track(int fd, struct mp3entry *id3, unsigned int da unsigned int chunksize = datasize; int valsize; - int codepage; + encoding_t codepage; /* parse contents info */ read(fd, tmp, 5); codepage = convert_smaf_codetype(tmp[2]); - if (codepage < 0) + if (codepage == ENCODING_NONE) { DEBUGF("metadata error: smaf unsupport codetype: %d\n", tmp[2]); return false; @@ -327,7 +320,7 @@ static bool parse_smaf_score_track(int fd, struct mp3entry *id3) unsigned int datasize; int valsize; - int codepage; + encoding_t codepage; /* parse Optional Data Chunk */ read(fd, tmp, 21); @@ -348,7 +341,7 @@ static bool parse_smaf_score_track(int fd, struct mp3entry *id3) } codepage = convert_smaf_codetype(tmp[16]); - if (codepage < 0) + if (codepage == ENCODING_NONE) { DEBUGF("metadata error: smaf unsupport codetype: %d\n", tmp[16]); return false; diff --git a/lib/rbcodec/metadata/spc.c b/lib/rbcodec/metadata/spc.c index 6d4590d..cabaac9 100644 --- a/lib/rbcodec/metadata/spc.c +++ b/lib/rbcodec/metadata/spc.c @@ -54,17 +54,17 @@ bool get_spc_metadata(int fd, struct mp3entry* id3) id3->title = p; buf[31] = 0; - p = iso_decode(buf, p, 0, 32); + p = decode_text(ENCODING_ISO_8859_1, buf, p, 32); buf += 32; id3->album = p; buf[31] = 0; - p = iso_decode(buf, p, 0, 32); + p = decode_text(ENCODING_ISO_8859_1, buf, p, 32); buf += 48; id3->comment = p; buf[31] = 0; - p = iso_decode(buf, p, 0, 32); + p = decode_text(ENCODING_ISO_8859_1, buf, p, 32); buf += 32; /* Date check */ @@ -114,7 +114,7 @@ bool get_spc_metadata(int fd, struct mp3entry* id3) id3->artist = p; buf[31] = 0; - iso_decode(buf, p, 0, 32); + p = decode_text(ENCODING_ISO_8859_1, buf, p, 32); if (length==0) { length=3*60*1000; /* 3 minutes */ diff --git 
a/lib/rbcodec/metadata/vgm.c b/lib/rbcodec/metadata/vgm.c index 7d3f45e..17a3afc 100644 --- a/lib/rbcodec/metadata/vgm.c +++ b/lib/rbcodec/metadata/vgm.c @@ -58,8 +58,8 @@ static byte const* get_gd3_str( byte const* in, byte const* end, char* field ) len = len < (int) max_field ? len : (int) max_field; field [len] = 0; - /* Conver to utf8 */ - utf16LEdecode( in, field, len ); + /* Convert to utf8 */ + decode_text(ENCODING_UTF_16LE, in, field, len); /* Copy string back to id3v2buf */ strcpy( (char*) in, field ); diff --git a/lib/rbcodec/metadata/wave.c b/lib/rbcodec/metadata/wave.c index 944c112..c431e64 100644 --- a/lib/rbcodec/metadata/wave.c +++ b/lib/rbcodec/metadata/wave.c @@ -134,17 +134,6 @@ struct wave_fmt { uint64_t numbytes; }; -static unsigned char *convert_utf8(const unsigned char *src, unsigned char *dst, - int size, bool is_64) -{ - if (is_64) - { - /* Note: wave64: metadata codepage is UTF-16 only */ - return utf16LEdecode(src, dst, size); - } - return iso_decode(src, dst, -1, size); -} - static void set_totalsamples(struct wave_fmt *fmt, struct mp3entry* id3) { switch (fmt->formattag) @@ -247,6 +236,7 @@ static void parse_list_chunk(int fd, struct mp3entry* id3, int chunksize, bool i unsigned char *endp; unsigned char *data_pos; unsigned char *tag_pos = id3->id3v2buf; + encoding_t encoding = is_64 ? ENCODING_UTF_16LE : ENCODING_DEFAULT; int datasize; int infosize; int remain; @@ -278,9 +268,8 @@ static void parse_list_chunk(int fd, struct mp3entry* id3, int chunksize, bool i if (memcmp(bp, info_chunks[i].tag, 4) == 0) { *((char **)(((char*)id3) + info_chunks[i].offset)) = tag_pos; - tag_pos = convert_utf8(data_pos, tag_pos, - (datasize + 1 >= remain )? remain - 1 : datasize, - is_64); + tag_pos = decode_text(encoding, data_pos, tag_pos, + (datasize + 1 >= remain )? 
remain - 1 : datasize);
                 *tag_pos++ = 0;
                 break;
             }
diff --git a/lib/rbcodec/platform.h b/lib/rbcodec/platform.h
index 19532a6..5bb4db6 100644
--- a/lib/rbcodec/platform.h
+++ b/lib/rbcodec/platform.h
@@ -8,6 +8,8 @@
 size_t strlcpy(char *dst, const char *src, size_t siz);
 #endif
 
+unsigned char* utf8encode(unsigned long ucs, unsigned char *utf8);
+
 #ifndef ARRAYLEN
 #define ARRAYLEN(a) (sizeof(a) / sizeof((a)[0]))
 #endif
diff --git a/lib/rbcodec/rbcodecplatform-unix.h b/lib/rbcodec/rbcodecplatform-unix.h
index 6a40d3d..80796b9 100644
--- a/lib/rbcodec/rbcodecplatform-unix.h
+++ b/lib/rbcodec/rbcodecplatform-unix.h
@@ -51,4 +51,58 @@ static inline off_t filesize(int fd) {
     } while (0)
 #endif
 
+/* Character encoding conversion */
+typedef enum {
+    ENCODING_NONE,
+    ENCODING_BIG_5,
+    ENCODING_DEFAULT,
+    ENCODING_EUC_CN,
+    ENCODING_ISO_8859_1,
+    ENCODING_SJIS,
+    ENCODING_UTF_16BE,
+    ENCODING_UTF_16LE,
+    ENCODING_UTF_8,
+} encoding_t;
+
+#include <iconv.h>
+
+/* Convert in_len bytes at `in` from `encoding` to UTF-8 at `out` via iconv.
+ *
+ * `out` is treated as having room for 4 * in_len bytes. ENCODING_NONE,
+ * ENCODING_DEFAULT and ENCODING_UTF_8 are all read as UTF-8.
+ *
+ * Returns a pointer just past the last byte written to `out`. If the
+ * converter cannot be opened nothing is written and `out` is returned; if
+ * iconv() stops early the result covers only what was converted so far.
+ */
+static inline char *decode_text(encoding_t encoding, const char *in, char *out, size_t in_len)
+{
+    const char *encoding_name;
+    char *inbuf = (char *)in; /* iconv() takes a non-const char ** */
+    char *outbuf = out;
+    size_t inbytesleft = in_len, outbytesleft = 4 * in_len;
+    iconv_t cd;
+
+    switch (encoding) {
+    case ENCODING_BIG_5: encoding_name = "BIG5"; break;
+    case ENCODING_EUC_CN: encoding_name = "EUC-CN"; break;
+    case ENCODING_ISO_8859_1: encoding_name = "ISO-8859-1"; break;
+    case ENCODING_SJIS: encoding_name = "SJIS"; break;
+    case ENCODING_UTF_16BE: encoding_name = "UTF-16BE"; break;
+    case ENCODING_UTF_16LE: encoding_name = "UTF-16LE"; break;
+    default: /* fallthrough */
+    case ENCODING_UTF_8: encoding_name = "UTF-8"; break;
+    }
+
+    cd = iconv_open("UTF-8", encoding_name);
+    if (cd == (iconv_t)-1)
+        return out;
+
+    /* iconv() advances outbuf past every byte it writes, so outbuf itself is
+     * the end of the output; adding the written count again would overshoot. */
+    iconv(cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft);
+    iconv_close(cd);
+    return outbuf;
+}
+
 #endif
-- 
1.7.6