Skip to content

Commit

Permalink
[librptext] conversion_iconv.cpp: Rework cpN_to_utf8() and cpN_to_utf…
Browse files Browse the repository at this point in the history
…16() to use a templated function internally.

The two functions are nearly identical except for the output type and
encoding.
  • Loading branch information
GerbilSoft committed Oct 24, 2024
1 parent 6605d38 commit 03668bd
Show file tree
Hide file tree
Showing 2 changed files with 67 additions and 125 deletions.
36 changes: 18 additions & 18 deletions src/librptext/conversion.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -202,11 +202,11 @@ typedef enum {
*
* The specified code page number will be used.
*
* @param cp [in] Code page number.
* @param str [in] 8-bit text.
* @param len [in] Length of str, in bytes. (-1 for NULL-terminated string)
* @param flags [in] Flags. (See TextConv_Flags_e.)
* @return UTF-8 string.
* @param cp [in] Code page number
* @param str [in] 8-bit text
* @param len [in] Length of str, in bytes (-1 for NULL-terminated string)
* @param flags [in] Flags (See TextConv_Flags_e)
* @return UTF-8 string
*/
RP_LIBROMDATA_PUBLIC
std::string cpN_to_utf8(unsigned int cp, const char *str, int len, unsigned int flags = 0);
Expand All @@ -215,10 +215,10 @@ std::string cpN_to_utf8(unsigned int cp, const char *str, int len, unsigned int
* Convert 8-bit text to UTF-8 using an RP-custom code page.
* Code page number must be CP_RP_*.
*
* @param cp [in] Code page number.
* @param str [in] 8-bit text.
* @param len [in] Length of str, in bytes. (-1 for NULL-terminated string)
* @return UTF-8 string.
* @param cp [in] Code page number
* @param str [in] 8-bit text
* @param len [in] Length of str, in bytes (-1 for NULL-terminated string)
* @return UTF-8 string
*/
std::string cpRP_to_utf8(unsigned int cp, const char *str, int len);

Expand All @@ -228,11 +228,11 @@ std::string cpRP_to_utf8(unsigned int cp, const char *str, int len);
*
* The specified code page number will be used.
*
* @param cp [in] Code page number.
* @param str [in] 8-bit text.
* @param len [in] Length of str, in bytes. (-1 for NULL-terminated string)
* @param flags [in] Flags. (See TextConv_Flags_e.)
* @return UTF-16 string.
* @param cp [in] Code page number
* @param str [in] 8-bit text
* @param len [in] Length of str, in bytes (-1 for NULL-terminated string)
* @param flags [in] Flags (See TextConv_Flags_e)
* @return UTF-16 string
*/
RP_LIBROMDATA_PUBLIC
std::u16string cpN_to_utf16(unsigned int cp, const char *str, int len, unsigned int flags = 0);
Expand All @@ -244,10 +244,10 @@ std::u16string cpN_to_utf16(unsigned int cp, const char *str, int len, unsigned
* The specified code page number will be used.
* Invalid characters will be ignored.
*
* @param cp [in] Code page number.
* @param str [in] UTF-8 text.
* @param len [in] Length of str, in bytes. (-1 for NULL-terminated string)
* @return 8-bit text.
* @param cp [in] Code page number
* @param str [in] UTF-8 text
* @param len [in] Length of str, in bytes (-1 for NULL-terminated string)
* @return 8-bit text
*/
RP_LIBROMDATA_PUBLIC
std::string utf8_to_cpN(unsigned int cp, const char *str, int len);
Expand Down
156 changes: 49 additions & 107 deletions src/librptext/conversion_iconv.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -190,24 +190,22 @@ static inline void codePageToEncName(char *enc_name, size_t len, unsigned int cp
}

/**
* Convert 8-bit text to UTF-8.
* Convert 8-bit text to UTF-8 or UTF-16.
* Trailing NULL bytes will be removed.
*
* The specified code page number will be used.
*
* @param cp [in] Code page number.
* @param str [in] 8-bit text.
* @param len [in] Length of str, in bytes. (-1 for NULL-terminated string)
* @param flags [in] Flags. (See TextConv_Flags_e.)
* @return UTF-8 string.
* @tparam T char (UTF-8) or char16_t (UTF-16)
* @param out_encoding [in] Output encoding name (passed to rp_iconv() as the target encoding)
* @param cp [in] Code page number
* @param str [in] 8-bit text
* @param len [in] Length of str, in bytes (-1 for NULL-terminated string)
* @param flags [in] Flags (See TextConv_Flags_e)
* @return UTF-8 or UTF-16 string, depending on T and out_encoding
*/
string cpN_to_utf8(unsigned int cp, const char *str, int len, unsigned int flags)
template<typename T>
std::basic_string<T> T_cpN_to_unicode(const char *out_encoding, unsigned int cp, const char *str, int len, unsigned int flags)
{
if (cp & CP_RP_BASE) {
// RP-custom code page.
return cpRP_to_utf8(cp, str, len);
}

len = check_NULL_terminator(str, len);

// Get the encoding name for the primary code page.
Expand All @@ -221,8 +219,8 @@ string cpN_to_utf8(unsigned int cp, const char *str, int len, unsigned int flags
// Attempt to convert the text to the output encoding (UTF-8 or UTF-16).
// NOTE: "//IGNORE" sometimes doesn't work, so we won't
// check for TEXTCONV_FLAG_CP1252_FALLBACK here.
string ret;
char *mbs =nullptr;
std::basic_string<T> ret;
T *out_str = nullptr;

if ((flags & TEXTCONV_FLAG_JIS_X_0208) && len >= 1) {
// Check if the string might be JIS X 0208.
Expand Down Expand Up @@ -250,32 +248,32 @@ string cpN_to_utf8(unsigned int cp, const char *str, int len, unsigned int flags
for (char &c : eucJP) {
c |= 0x80;
}
mbs = reinterpret_cast<char*>(rp_iconv((char*)eucJP.data(), eucJP.size(), "EUC-JP", "UTF-8", ignoreErr));
out_str = reinterpret_cast<T*>(rp_iconv((char*)eucJP.data(), eucJP.size(), "EUC-JP", out_encoding, ignoreErr));
}
}

if (!mbs) {
if (!out_str) {
// Standard string conversion
mbs = reinterpret_cast<char*>(rp_iconv((char*)str, len*sizeof(*str), cp_name, "UTF-8", ignoreErr));
out_str = reinterpret_cast<T*>(rp_iconv((char*)str, len*sizeof(*str), cp_name, out_encoding, ignoreErr));
}

if (!mbs /*&& (flags & TEXTCONV_FLAG_CP1252_FALLBACK)*/) {
if (!out_str /*&& (flags & TEXTCONV_FLAG_CP1252_FALLBACK)*/) {
// Try cp1252 fallback.
// NOTE: Sometimes cp1252 fails, even with ignore set.
if (cp != 1252) {
mbs = reinterpret_cast<char*>(rp_iconv((char*)str, len*sizeof(*str), "CP1252", "UTF-8", true));
out_str = reinterpret_cast<T*>(rp_iconv((char*)str, len*sizeof(*str), "CP1252", out_encoding, true));
}
if (!mbs) {
if (!out_str) {
// Try Latin-1 fallback.
if (cp != CP_LATIN1) {
mbs = reinterpret_cast<char*>(rp_iconv((char*)str, len*sizeof(*str), "LATIN1", "UTF-8", true));
out_str = reinterpret_cast<T*>(rp_iconv((char*)str, len*sizeof(*str), "LATIN1", out_encoding, true));
}
}
}

if (mbs) {
ret.assign(mbs);
free(mbs);
if (out_str) {
ret.assign(out_str);
free(out_str);

#ifdef HAVE_ICONV_LIBICONV
if (cp == CP_SJIS) {
Expand All @@ -298,98 +296,42 @@ string cpN_to_utf8(unsigned int cp, const char *str, int len, unsigned int flags
}

/**
* Convert 8-bit text to UTF-16.
* Convert 8-bit text to UTF-8.
* Trailing NULL bytes will be removed.
*
* The specified code page number will be used.
*
* @param cp [in] Code page number.
* @param str [in] 8-bit text.
* @param len [in] Length of str, in bytes. (-1 for NULL-terminated string)
* @param flags [in] Flags. (See TextConv_Flags_e.)
* @return UTF-16 string.
* @param cp [in] Code page number
* @param str [in] 8-bit text
* @param len [in] Length of str, in bytes (-1 for NULL-terminated string)
* @param flags [in] Flags (See TextConv_Flags_e)
* @return UTF-8 string
*/
u16string cpN_to_utf16(unsigned int cp, const char *str, int len, unsigned int flags)
string cpN_to_utf8(unsigned int cp, const char *str, int len, unsigned int flags)
{
len = check_NULL_terminator(str, len);

// Get the encoding name for the primary code page.
char cp_name[20];
codePageToEncName(cp_name, sizeof(cp_name), cp);

// If we *want* to fall back to cp1252 on error,
// then the first conversion should fail on errors.
const bool ignoreErr = !(flags & TEXTCONV_FLAG_CP1252_FALLBACK);

// Attempt to convert the text to UTF-16.
// NOTE: "//IGNORE" sometimes doesn't work, so we won't
// check for TEXTCONV_FLAG_CP1252_FALLBACK here.
u16string ret;
char16_t *wcs = nullptr;

if ((flags & TEXTCONV_FLAG_JIS_X_0208) && len >= 1) {
// Check if the string might be JIS X 0208.
// If it is, make it EUC-JP compatible, then convert it.
bool is0208 = false;
// Heuristic: First character should be 0x21-0x24.
if (*str >= 0x21 && *str <= 0x24) {
is0208 = true;
const char *const p_end = str + len;
for (const char *p = str + 1; p < p_end; p++) {
const uint8_t chr = static_cast<uint8_t>(*p);
if (chr == 0) {
// End of string
break;
} else if (chr & 0x80) {
// High bit cannot be set
is0208 = false;
}
}
}

if (is0208) {
// Make the string EUC-JP compatible.
string eucJP(str, 0, len);
for (char &c : eucJP) {
c |= 0x80;
}
wcs = reinterpret_cast<char16_t*>(rp_iconv((char*)eucJP.data(), eucJP.size(), "EUC-JP", RP_ICONV_UTF16_ENCODING, ignoreErr));
}
}

if (!wcs) {
// Standard string conversion
wcs = reinterpret_cast<char16_t*>(rp_iconv((char*)str, len*sizeof(*str), cp_name, RP_ICONV_UTF16_ENCODING, ignoreErr));
}

if (!wcs /*&& (flags & TEXTCONV_FLAG_CP1252_FALLBACK)*/) {
// Try cp1252 fallback.
// NOTE: Sometimes cp1252 fails, even with ignore set.
if (cp != 1252) {
wcs = reinterpret_cast<char16_t*>(rp_iconv((char*)str, len*sizeof(*str), "CP1252", RP_ICONV_UTF16_ENCODING, true));
}
if (!wcs) {
// Try Latin-1 fallback.
if (cp != CP_LATIN1) {
wcs = reinterpret_cast<char16_t*>(rp_iconv((char*)str, len*sizeof(*str), "LATIN1//IGNORE", RP_ICONV_UTF16_ENCODING, true));
}
}
if (cp & CP_RP_BASE) {
// RP-custom code page.
return cpRP_to_utf8(cp, str, len);
}

if (wcs) {
ret.assign(wcs);
free(wcs);
return T_cpN_to_unicode<char>("UTF-8", cp, str, len, flags);
}

#ifdef HAVE_ICONV_LIBICONV
if (cp == CP_SJIS) {
// libiconv's cp932 maps Shift-JIS 8160 (Wave Dash) to U+301C.
// This is expected behavior for Shift-JIS, but cp932 should
// map it to U+FF5E.
std::replace(ret.begin(), ret.end(), (char16_t)0x301C, (char16_t)0xFF5E);
}
#endif /* HAVE_ICONV_LIBICONV */
}
return ret;
/**
* Convert 8-bit text to UTF-16.
* Trailing NULL bytes will be removed.
*
* The specified code page number will be used.
*
* @param cp [in] Code page number
* @param str [in] 8-bit text
* @param len [in] Length of str, in bytes (-1 for NULL-terminated string)
* @param flags [in] Flags (See TextConv_Flags_e)
* @return UTF-16 string
*/
u16string cpN_to_utf16(unsigned int cp, const char *str, int len, unsigned int flags)
{
// Thin wrapper: delegate to the shared templated converter, selecting
// char16_t output and the iconv encoding name for UTF-16.
// NOTE(review): unlike cpN_to_utf8(), this function does not special-case
// RP-custom code pages (cp & CP_RP_BASE) before delegating; such values are
// passed straight to the templated converter. Confirm this is intentional.
return T_cpN_to_unicode<char16_t>(RP_ICONV_UTF16_ENCODING, cp, str, len, flags);
}

/**
Expand Down

0 comments on commit 03668bd

Please sign in to comment.