From c703f2a3b302b5a42d7fab180832e896e62a7a46 Mon Sep 17 00:00:00 2001 From: Haozhun Jin Date: Fri, 20 Mar 2015 12:01:05 -0700 Subject: [PATCH] Fix char class casefold for certain chars When a code point fits in a single byte (<= 0xff) yet takes more than 1 byte in the current encoding, the case folding code incorrectly puts it in the bitset instead of a code range. As a result, with the UTF-8 encoding, case folding works incorrectly inside character classes for characters in the range \u0080 to \u00ff (Latin-1 Supplement). Before the fix, matching with IGNORECASE (subject, then pattern): * `"\u00c2"` `[\u00e0-\u00e5]` returns false * `"\u00c2"` `[\u00e2]` returns false * `"\u00c2"` `\u00e2` returns true (the same letter outside a character class already matched) --- src/org/joni/ApplyCaseFold.java | 2 +- test/org/joni/test/TestJava.java | 55 ++++++++++++++++++++++++++++++++ 2 files changed, 56 insertions(+), 1 deletion(-) create mode 100644 test/org/joni/test/TestJava.java diff --git a/src/org/joni/ApplyCaseFold.java b/src/org/joni/ApplyCaseFold.java index 7dd84ce1..6a8d1c35 100644 --- a/src/org/joni/ApplyCaseFold.java +++ b/src/org/joni/ApplyCaseFold.java @@ -41,7 +41,7 @@ public void apply(int from, int[]to, int length, Object o) { if (Config.CASE_FOLD_IS_APPLIED_INSIDE_NEGATIVE_CCLASS) { if ((inCC && !cc.isNot()) || (!inCC && cc.isNot())) { - if (enc.minLength() > 1 || to[0] >= BitSet.SINGLE_BYTE_SIZE) { + if (enc.minLength() > 1 || to[0] >= BitSet.SINGLE_BYTE_SIZE || enc.codeToMbcLength(to[0]) > 1) { cc.addCodeRange(env, to[0], to[0]); } else { /* /(?i:[^A-C])/.match("a") ==> fail. 
*/ diff --git a/test/org/joni/test/TestJava.java b/test/org/joni/test/TestJava.java new file mode 100644 index 00000000..7837852e --- /dev/null +++ b/test/org/joni/test/TestJava.java @@ -0,0 +1,55 @@ +/* + * Permission is hereby granted, free of charge, to any person obtaining a copy of + * this software and associated documentation files (the "Software"), to deal in + * the Software without restriction, including without limitation the rights to + * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies + * of the Software, and to permit persons to whom the Software is furnished to do + * so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +package org.joni.test; + +import org.jcodings.Encoding; +import org.jcodings.specific.UTF8Encoding; +import org.joni.Option; +import org.joni.Syntax; + +public class TestJava extends Test { + + public int option() { + return Option.DEFAULT; + } + + public Encoding encoding() { + return UTF8Encoding.INSTANCE; + } + + public String testEncoding() { + return "utf-8"; + } + + public Syntax syntax() { + return Syntax.Java; + } + + public void test() throws InterruptedException { + // test ignorecase for Latin-1 Supplement + x2s("[\\u00e0-\\u00e5]", "\u00c2", 0, 2, Option.IGNORECASE); + x2s("[\\u00e2]", "\u00c2", 0, 2, Option.IGNORECASE); + x2s("\\u00e2", "\u00c2", 0, 2, Option.IGNORECASE); + } + + public static void main(String[] args) throws Throwable { + new TestJava().run(); + } +}