Skip to content

Commit

Permalink
Merge pull request #2071 from natalie-lang/case-map-turkic
Browse files Browse the repository at this point in the history
  • Loading branch information
seven1m authored Jun 6, 2024
2 parents 4253efb + 91f7200 commit 0617d89
Show file tree
Hide file tree
Showing 6 changed files with 71 additions and 88 deletions.
14 changes: 11 additions & 3 deletions include/natalie/encoding_object.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,14 @@

namespace Natalie {

// Bit flags describing which case-mapping options were requested.
// The non-zero values occupy distinct bits so they can be combined
// (e.g. turkic together with lithuanian).
enum CaseMapType {
    CaseMapFull = 0, // no option bits set (the default)
    CaseMapAscii = 1 << 0, // the :ascii option
    CaseMapTurkicAzeri = 1 << 1, // the :turkic option
    CaseMapLithuanian = 1 << 2, // the :lithuanian option
    CaseMapFold = 1 << 3, // the :fold option (only accepted when downcasing)
};

// NOTE(review): presumably the maximum number of codepoints a special-casing
// entry can produce for the lower/title/upper mapping respectively -- confirm
// against SpecialCasingEntry and the size of the result buffers callers pass.
const int SPECIAL_CASE_LOWER_MAX_SIZE = 2;
const int SPECIAL_CASE_TITLE_MAX_SIZE = 3;
const int SPECIAL_CASE_UPPER_MAX_SIZE = 3;
Expand Down Expand Up @@ -96,9 +104,9 @@ class EncodingObject : public Object {
static EncodingObject *find_encoding(Env *env, Value encoding);

// The caller must pass a buffer of nat_int_t to these functions; the uint8_t return value is the number of codepoints written to it.
static uint8_t codepoint_to_lowercase(nat_int_t codepoint, nat_int_t result[], bool ascii_only = false);
static uint8_t codepoint_to_uppercase(nat_int_t codepoint, nat_int_t result[], bool ascii_only = false);
static uint8_t codepoint_to_titlecase(nat_int_t codepoint, nat_int_t result[], bool ascii_only = false);
static uint8_t codepoint_to_lowercase(nat_int_t codepoint, nat_int_t result[], CaseMapType flags = CaseMapFull);
static uint8_t codepoint_to_uppercase(nat_int_t codepoint, nat_int_t result[], CaseMapType flags = CaseMapFull);
static uint8_t codepoint_to_titlecase(nat_int_t codepoint, nat_int_t result[], CaseMapType flags = CaseMapFull);

static void init_special_casing_map();
static SpecialCasingEntry find_special_casing_map_entry(nat_int_t codepoint);
Expand Down
19 changes: 5 additions & 14 deletions include/natalie/string_object.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,20 +16,11 @@ namespace Natalie {

using namespace TM;

enum CaseFoldType {
Ascii = 1,
FoldTurkicAzeri = 2,
FoldLithuanian = 4,
Upcase = 8,
Downcase = 16,
Fold = 32
};

inline CaseFoldType operator|(CaseFoldType a, CaseFoldType b) {
return static_cast<CaseFoldType>(static_cast<int>(a) | static_cast<int>(b));
inline CaseMapType operator|(CaseMapType a, CaseMapType b) {
return static_cast<CaseMapType>(static_cast<int>(a) | static_cast<int>(b));
}
inline CaseFoldType operator^(CaseFoldType a, CaseFoldType b) {
return static_cast<CaseFoldType>(static_cast<int>(a) ^ static_cast<int>(b));
inline CaseMapType operator^(CaseMapType a, CaseMapType b) {
return static_cast<CaseMapType>(static_cast<int>(a) ^ static_cast<int>(b));
}

class StringObject : public Object {
Expand Down Expand Up @@ -404,7 +395,7 @@ class StringObject : public Object {
size_t char_index_to_byte_index(size_t) const;
size_t byte_index_to_char_index(size_t) const;

static CaseFoldType check_case_options(Env *env, Value arg1, Value arg2, CaseFoldType flags);
static CaseMapType check_case_options(Env *env, Value arg1, Value arg2, bool downcase = false);

unsigned char at(size_t index) const {
return m_string.at(index);
Expand Down
36 changes: 12 additions & 24 deletions spec/core/string/capitalize_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -44,15 +44,11 @@

describe "full Unicode case mapping adapted for Turkic languages" do
it "capitalizes ASCII characters according to Turkic semantics" do
NATFIXME 'Pending unicode casemap support', exception: SpecFailedException do
"iSa".capitalize(:turkic).should == "İsa"
end
"iSa".capitalize(:turkic).should == "İsa"
end

it "allows Lithuanian as an extra option" do
NATFIXME 'Pending unicode casemap support', exception: SpecFailedException do
"iSa".capitalize(:turkic, :lithuanian).should == "İsa"
end
"iSa".capitalize(:turkic, :lithuanian).should == "İsa"
end

it "does not allow any other additional option" do
Expand All @@ -66,9 +62,7 @@
end

it "allows Turkic as an extra option (and applies Turkic semantics)" do
NATFIXME 'Pending unicode casemap support', exception: SpecFailedException do
"iß".capitalize(:lithuanian, :turkic).should == "İß"
end
"iß".capitalize(:lithuanian, :turkic).should == "İß"
end

it "does not allow any other additional option" do
Expand Down Expand Up @@ -153,19 +147,15 @@

describe "modifies self in place for full Unicode case mapping adapted for Turkic languages" do
it "capitalizes ASCII characters according to Turkic semantics" do
NATFIXME 'Pending unicode casemap support', exception: SpecFailedException do
a = "iSa"
a.capitalize!(:turkic)
a.should == "İsa"
end
a = "iSa"
a.capitalize!(:turkic)
a.should == "İsa"
end

it "allows Lithuanian as an extra option" do
NATFIXME 'Pending unicode casemap support', exception: SpecFailedException do
a = "iSa"
a.capitalize!(:turkic, :lithuanian)
a.should == "İsa"
end
a = "iSa"
a.capitalize!(:turkic, :lithuanian)
a.should == "İsa"
end

it "does not allow any other additional option" do
Expand All @@ -181,11 +171,9 @@
end

it "allows Turkic as an extra option (and applies Turkic semantics)" do
NATFIXME 'Pending unicode casemap support', exception: SpecFailedException do
a = "iß"
a.capitalize!(:lithuanian, :turkic)
a.should == "İß"
end
a = "iß"
a.capitalize!(:lithuanian, :turkic)
a.should == "İß"
end

it "does not allow any other additional option" do
Expand Down
24 changes: 6 additions & 18 deletions spec/core/string/upcase_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -39,15 +39,11 @@

describe "full Unicode case mapping adapted for Turkic languages" do
it "upcases ASCII characters according to Turkic semantics" do
NATFIXME 'Pending unicode casemap support', exception: SpecFailedException do
"i".upcase(:turkic).should == "İ"
end
"i".upcase(:turkic).should == "İ"
end

it "allows Lithuanian as an extra option" do
NATFIXME 'Pending unicode casemap support', exception: SpecFailedException do
"i".upcase(:turkic, :lithuanian).should == "İ"
end
"i".upcase(:turkic, :lithuanian).should == "İ"
end

it "does not allow any other additional option" do
Expand All @@ -61,9 +57,7 @@
end

it "allows Turkic as an extra option (and applies Turkic semantics)" do
NATFIXME 'Pending unicode casemap support', exception: SpecFailedException do
"iß".upcase(:lithuanian, :turkic).should == "İSS"
end
"iß".upcase(:lithuanian, :turkic).should == "İSS"
end

it "does not allow any other additional option" do
Expand Down Expand Up @@ -139,17 +133,13 @@
it "upcases ASCII characters according to Turkic semantics" do
a = "i"
a.upcase!(:turkic)
NATFIXME 'Pending unicode casemap support', exception: SpecFailedException do
a.should == "İ"
end
a.should == "İ"
end

it "allows Lithuanian as an extra option" do
a = "i"
a.upcase!(:turkic, :lithuanian)
NATFIXME 'Pending unicode casemap support', exception: SpecFailedException do
a.should == "İ"
end
a.should == "İ"
end

it "does not allow any other additional option" do
Expand All @@ -167,9 +157,7 @@
it "allows Turkic as an extra option (and applies Turkic semantics)" do
a = "iß"
a.upcase!(:lithuanian, :turkic)
NATFIXME 'Pending unicode casemap support', exception: SpecFailedException do
a.should == "İSS"
end
a.should == "İSS"
end

it "does not allow any other additional option" do
Expand Down
23 changes: 17 additions & 6 deletions src/encoding_object.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -244,8 +244,8 @@ void EncodingObject::initialize_defaults(Env *env) {
s_filesystem = s_default_external;
}

uint8_t EncodingObject::codepoint_to_lowercase(nat_int_t codepoint, nat_int_t result[], bool ascii_only) {
if (ascii_only) {
uint8_t EncodingObject::codepoint_to_lowercase(nat_int_t codepoint, nat_int_t result[], CaseMapType flags) {
if (flags & CaseMapAscii) {
if (codepoint >= 'A' && codepoint <= 'Z')
result[0] = codepoint + 32;
else
Expand Down Expand Up @@ -277,15 +277,20 @@ uint8_t EncodingObject::codepoint_to_lowercase(nat_int_t codepoint, nat_int_t re
return 1;
}

uint8_t EncodingObject::codepoint_to_uppercase(nat_int_t codepoint, nat_int_t result[], bool ascii_only) {
if (ascii_only) {
uint8_t EncodingObject::codepoint_to_uppercase(nat_int_t codepoint, nat_int_t result[], CaseMapType flags) {
if (flags & CaseMapAscii) {
if (codepoint >= 'a' && codepoint <= 'z')
result[0] = codepoint - 32;
else
result[0] = codepoint;
return 1;
}

if (flags & CaseMapTurkicAzeri && codepoint == 0x69) {
result[0] = 0x130;
return 1;
}

auto block = codepoint >> 8;
auto index = ucase_index[block] + (codepoint & 0xff);
auto delta = ucase_map[index];
Expand All @@ -310,8 +315,14 @@ uint8_t EncodingObject::codepoint_to_uppercase(nat_int_t codepoint, nat_int_t re
return 1;
}

uint8_t EncodingObject::codepoint_to_titlecase(nat_int_t codepoint, nat_int_t result[], bool ascii_only) {
if (ascii_only) return codepoint_to_uppercase(codepoint, result, true);
uint8_t EncodingObject::codepoint_to_titlecase(nat_int_t codepoint, nat_int_t result[], CaseMapType flags) {
if (flags & CaseMapAscii)
return codepoint_to_uppercase(codepoint, result, CaseMapAscii);

if (flags & CaseMapTurkicAzeri && codepoint == 0x69) {
result[0] = 0x130;
return 1;
}

auto block = codepoint >> 8;
auto index = tcase_index[block] + (codepoint & 0xff);
Expand Down
43 changes: 20 additions & 23 deletions src/string_object.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -2758,39 +2758,38 @@ Value StringObject::rstrip_in_place(Env *env) {

// This checks the case-mapping options passed as arguments to methods like
// downcase, upcase, casecmp, etc., and returns them as a bitfield enum.
CaseFoldType StringObject::check_case_options(Env *env, Value arg1, Value arg2, CaseFoldType flags) {
CaseMapType StringObject::check_case_options(Env *env, Value arg1, Value arg2, bool downcase) {
SymbolObject *turk = "turkic"_s;
SymbolObject *lith = "lithuanian"_s;
// return for zero arg case
if (arg1.is_null() && arg2.is_null())
return flags;
return CaseMapFull;
// two arg case only accepts turkic and lithuanian (in either order)
if (!arg1.is_null() && !arg2.is_null()) {
if ((arg1 == turk && arg2 == lith) || (arg1 == lith && arg2 == turk)) {
return flags | FoldTurkicAzeri | FoldLithuanian;
return CaseMapTurkicAzeri | CaseMapLithuanian;
} else {
// any other pair of arguments is an error
env->raise("ArgumentError", "invalid option");
}
}
// acceptable symbols as options: [turkic lithuanian ascii fold]
if (arg1 == "ascii"_s) {
return flags | Ascii;
return CaseMapAscii;
} else if (arg1 == "fold"_s) {
if ((flags & (Upcase | Downcase)) == Downcase) {
flags = flags ^ (Fold | Downcase);
return flags;
if (downcase) {
return CaseMapFold;
} else {
env->raise("ArgumentError", "option :fold only allowed for downcasing");
}
} else if (arg1 == turk) {
return flags | FoldTurkicAzeri;
return CaseMapTurkicAzeri;
} else if (arg1 == lith) {
return flags | FoldLithuanian;
return CaseMapLithuanian;
} else {
env->raise("ArgumentError", "invalid option");
}
return flags;
return CaseMapFull;
}

// TODO: It is probably more efficient to do the cmp inline so that the
Expand All @@ -2816,18 +2815,17 @@ Value StringObject::is_casecmp(Env *env, Value other) {
}

StringObject *StringObject::capitalize(Env *env, Value arg1, Value arg2) {
auto flags = check_case_options(env, arg1, arg2, Fold);
auto flags = check_case_options(env, arg1, arg2);
auto str = new StringObject { "", m_encoding };
bool first_char = true;
auto ascii_only = flags & Ascii;
nat_int_t result[3] = {};
uint8_t length = 0;
for (StringView c : *this) {
nat_int_t codepoint = m_encoding->decode_codepoint(c);
if (first_char)
length = EncodingObject::codepoint_to_titlecase(codepoint, result, ascii_only);
length = EncodingObject::codepoint_to_titlecase(codepoint, result, flags);
else
length = EncodingObject::codepoint_to_lowercase(codepoint, result, ascii_only);
length = EncodingObject::codepoint_to_lowercase(codepoint, result, flags);
for (uint8_t i = 0; i < length; i++)
str->append(m_encoding->encode_codepoint(result[i]));
first_char = false;
Expand All @@ -2846,15 +2844,15 @@ Value StringObject::capitalize_in_place(Env *env, Value arg1, Value arg2) {
}

StringObject *StringObject::downcase(Env *env, Value arg1, Value arg2) {
auto flags = check_case_options(env, arg1, arg2, Downcase);
auto flags = check_case_options(env, arg1, arg2, true);
auto str = new StringObject { "", m_encoding };
nat_int_t result[3] = {};
for (StringView c : *this) {
auto codepoint = m_encoding->decode_codepoint(c);
if (flags & Ascii) {
EncodingObject::codepoint_to_lowercase(codepoint, result, true);
if (flags & CaseMapAscii) {
EncodingObject::codepoint_to_lowercase(codepoint, result, flags);
str->append(m_encoding->encode_codepoint(result[0]));
} else if ((flags & Fold || flags & FoldLithuanian) && !(flags & FoldTurkicAzeri)) {
} else if ((flags & CaseMapFold || flags & CaseMapLithuanian) && !(flags & CaseMapTurkicAzeri)) {
auto result = EncodingObject::casefold_full(codepoint);
if (result->is_array()) {
for (auto item : *result->as_array()) {
Expand All @@ -2866,7 +2864,7 @@ StringObject *StringObject::downcase(Env *env, Value arg1, Value arg2) {
str->append(m_encoding->encode_codepoint(codepoint));
}
} else {
auto length = EncodingObject::codepoint_to_lowercase(codepoint, result);
auto length = EncodingObject::codepoint_to_lowercase(codepoint, result, flags);
for (uint8_t i = 0; i < length; i++)
str->append(m_encoding->encode_codepoint(result[i]));
}
Expand Down Expand Up @@ -2895,13 +2893,12 @@ Value StringObject::dump(Env *env) {
}

StringObject *StringObject::upcase(Env *env, Value arg1, Value arg2) {
auto flags = check_case_options(env, arg1, arg2, Upcase);
auto flags = check_case_options(env, arg1, arg2);
auto str = new StringObject { "", m_encoding };
auto ascii_only = flags & Ascii;
nat_int_t result[3] = {};
for (StringView c : *this) {
auto codepoint = m_encoding->decode_codepoint(c);
auto length = EncodingObject::codepoint_to_uppercase(codepoint, result, ascii_only);
auto length = EncodingObject::codepoint_to_uppercase(codepoint, result, flags);
for (uint8_t i = 0; i < length; i++)
str->append(m_encoding->encode_codepoint(result[i]));
}
Expand All @@ -2921,7 +2918,7 @@ Value StringObject::upcase_in_place(Env *env, Value arg1, Value arg2) {

StringObject *StringObject::swapcase(Env *env, Value arg1, Value arg2) {
// currently not doing anything with the returned flags
check_case_options(env, arg1, arg2, Fold);
check_case_options(env, arg1, arg2);
auto str = new StringObject { "", m_encoding };
for (StringView c : *this) {
nat_int_t codept = m_encoding->decode_codepoint(c);
Expand Down

0 comments on commit 0617d89

Please sign in to comment.