From 46394865a1d4b801dd0925537f4fd19fd132d45e Mon Sep 17 00:00:00 2001 From: Kurt Thiemann Date: Tue, 29 Nov 2022 18:50:00 +0100 Subject: [PATCH] added support for decoding Java's weird modified UTF-8 encoding --- README.md | 12 ++ composer.json | 3 +- src/String/JavaEncoding.php | 139 +++++++++++++++++++++++ src/String/StringDataFormatException.php | 8 ++ src/Tag/StringTag.php | 23 ++++ 5 files changed, 184 insertions(+), 1 deletion(-) create mode 100644 src/String/JavaEncoding.php create mode 100644 src/String/StringDataFormatException.php diff --git a/README.md b/README.md index 2c11a51..d603bde 100644 --- a/README.md +++ b/README.md @@ -49,6 +49,18 @@ $myInt->setValue(42); echo $myInt->getValue(); // 42 ``` +On String tags, `getValue()` and `setValue()` will use the raw string data, which uses Java's modified +UTF-8 encoding. To use different encodings, +use `getDecodedValue($encoding = "UTF-8")` and `setDecodedValue($value, $encoding = "UTF-8")` instead. +A list of supported encodings is returned by the `mb_list_encodings()` function. + +```php +$myString = new \Aternos\Nbt\Tag\StringTag(); + +$myString->setDecodedValue("Hello world!"); +echo $myString->getDecodedValue(); // Hello world! +``` + Compound tags, list tags, and array tags implement the `ArrayAccess`, `Countable`, and `Iterator` interfaces and can therefore be accessed as arrays. 
```php diff --git a/composer.json b/composer.json index e3ebc55..d81e0fe 100644 --- a/composer.json +++ b/composer.json @@ -19,6 +19,7 @@ "php": ">=8.0", "php-64bit": "*", "ext-zlib": "*", - "ext-json": "*" + "ext-json": "*", + "ext-mbstring": "*" } } diff --git a/src/String/JavaEncoding.php b/src/String/JavaEncoding.php new file mode 100644 index 0000000..6fcaa5a --- /dev/null +++ b/src/String/JavaEncoding.php @@ -0,0 +1,139 @@ +> 0x06))); + $result .= chr(0x80 | (0x3F & $c)); + continue; + } + + if($c <= 0xFFFF) { + $result .= chr(0xE0 | (0x0F & ($c >> 0x0C))); + $result .= chr(0x80 | (0x3F & ($c >> 0x06))); + $result .= chr(0x80 | (0x3F & $c)); + continue; + } + + $result .= chr(0xED); + $result .= chr(0xA0 | ((($c - 0x10000) >> 0x10) & 0x0F)); // surrogate pairs carry the code point minus 0x10000 + $result .= chr(0x80 | (($c >> 0x0A) & 0x3f)); + $result .= chr(0xED); + $result .= chr(0xb0 | (($c >> 0x06) & 0x0f)); + $result .= chr(0x80 | ($c & 0x3f)); + } + + return $result; + } + + /** + * @throws StringDataFormatException if $string contains a raw NULL byte or a malformed byte sequence + */ + public function decode(string $string, string $outputEncoding = "UTF-8"): string + { + $result = ""; + for ($i = 0; $i < strlen($string); $i++) { + $a = ord($string[$i]); + + if ($a === 0) { + throw new StringDataFormatException("Invalid NULL byte in string"); + } + + // Single byte character + if (($a & 0b10000000) === 0b0) { + $result .= mb_chr($a, $outputEncoding); + continue; + } + + $b = ord($string[++$i] ?? "\0"); + + // Two byte character + if (($a & 0b11100000) === 0b11000000) { + if (($b & 0b11000000) !== 0b10000000) { + throw new StringDataFormatException("Invalid \"UTF-8\" sequence"); + } + + $result .= mb_chr((($a & 0x1F) << 6) | ($b & 0x3F), $outputEncoding); + continue; + } + + $c = ord($string[++$i] ?? "\0"); + + // Maybe six byte character (a surrogate pair, each half stored as its own three byte sequence) + if ($a === 0b11101101 && ($b & 0b11110000) === 0b10100000 && ($c & 0b11000000) === 0b10000000) { + $d = ord($string[$i + 1] ?? "\0"); + $e = ord($string[$i + 2] ?? "\0"); + $f = ord($string[$i + 3] ?? 
"\0"); + + // Six byte character: 0x10000 is added to (not OR-ed with) the 20 payload bits, which may have bit 16 set + if ($d === 0b11101101 && ($e & 0b11110000) === 0b10110000 && ($f & 0b11000000) === 0b10000000) { + $result .= mb_chr(0x10000 + ( + ($b & 0x0F) << 0x10 | + ($c & 0x3F) << 0x0A | + ($e & 0x0F) << 0x06 | + ($f & 0x3F)), $outputEncoding); + + $i += 3; + continue; + } + } + + // Three byte character + if (($a & 0b11110000) === 0b11100000) { + if (($b & 0b11000000) !== 0b10000000 || ($c & 0b11000000) !== 0b10000000) { + throw new StringDataFormatException("Invalid \"UTF-8\" sequence"); + } + + $result .= mb_chr((($a & 0x0F) << 12) | (($b & 0x3F) << 6) | ($c & 0x3F), $outputEncoding); + continue; + } + + throw new StringDataFormatException("Invalid \"UTF-8\" sequence"); + } + return $result; + } +} \ No newline at end of file diff --git a/src/String/StringDataFormatException.php b/src/String/StringDataFormatException.php new file mode 100644 index 0000000..65e779e --- /dev/null +++ b/src/String/StringDataFormatException.php @@ -0,0 +1,8 @@ +value; } + /** + * @param string $encoding output encoding, forwarded to JavaEncoding::decode() + * @return string the stored raw value after decoding + * @throws StringDataFormatException + */ + public function getDecodedValue(string $encoding = "UTF-8"): string + { + return JavaEncoding::getInstance()->decode($this->value, $encoding); + } + /** * @param string $value * @return StringTag @@ -30,6 +42,17 @@ public function setValue(string $value): StringTag return $this; } + /** + * @param string $value text that is passed through JavaEncoding::encode() before being stored + * @param string $encoding forwarded to JavaEncoding::encode() together with $value + * @return StringTag this tag + */ + public function setDecodedValue(string $value, string $encoding = "UTF-8"): StringTag + { + $this->value = JavaEncoding::getInstance()->encode($value, $encoding); + return $this; + } + /** * @return int */