Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement String#each_grapheme_cluster #2108

Merged
merged 1 commit into from
Jun 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions include/natalie/encoding/utf8_encoding_object.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,8 @@ class Utf8EncodingObject : public EncodingObject {
virtual std::pair<bool, StringView> prev_char(const String &string, size_t *index) const override;
virtual std::pair<bool, StringView> next_char(const String &string, size_t *index) const override;

// Returns a view of the next grapheme cluster of `string` starting at `*index`
// and advances `*index` past the characters that were consumed.
virtual StringView next_grapheme_cluster(const String &string, size_t *index) const override;

virtual bool is_printable_char(const nat_int_t c) const override;
virtual String escaped_char(const nat_int_t c) const override;

Expand Down
7 changes: 7 additions & 0 deletions include/natalie/encoding_object.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -68,8 +68,15 @@ class EncodingObject : public Object {
virtual bool is_dummy() const { return false; }

virtual bool valid_codepoint(nat_int_t codepoint) const = 0;

virtual std::pair<bool, StringView> prev_char(const String &, size_t *) const = 0;
virtual std::pair<bool, StringView> next_char(const String &, size_t *) const = 0;

// Default grapheme-cluster iteration for encodings that do not override it:
// each character reported by next_char() is treated as a cluster of its own.
// `index` is advanced by next_char(); the validity flag it reports is
// discarded, so callers only ever see the resulting view.
virtual StringView next_grapheme_cluster(const String &str, size_t *index) const {
    return next_char(str, index).second;
}

virtual bool is_printable_char(const nat_int_t c) const;
virtual String escaped_char(const nat_int_t c) const = 0;
virtual Value encode(Env *, EncodingObject *, StringObject *) const;
Expand Down
2 changes: 2 additions & 0 deletions include/natalie/string_object.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -212,6 +212,8 @@ class StringObject : public Object {
Value each_codepoint(Env *, Block *);
Value codepoints(Env *, Block *);

// Runs the block once per grapheme cluster of this string and returns the
// receiver; when no block is given, returns the result of enum_for instead.
Value each_grapheme_cluster(Env *, Block *);

Value each_line(Env *, Value = nullptr, Value = nullptr, Block * = nullptr);
Value lines(Env *, Value = nullptr, Value = nullptr, Block * = nullptr);

Expand Down
1 change: 1 addition & 0 deletions lib/natalie/compiler/binding_gen.rb
Original file line number Diff line number Diff line change
Expand Up @@ -1262,6 +1262,7 @@ def generate_name
gen.binding('String', 'each_byte', 'StringObject', 'each_byte', argc: 0, pass_env: true, pass_block: true, return_type: :Object)
gen.binding('String', 'each_char', 'StringObject', 'each_char', argc: 0, pass_env: true, pass_block: true, return_type: :Object)
gen.binding('String', 'each_codepoint', 'StringObject', 'each_codepoint', argc: 0, pass_env: true, pass_block: true, return_type: :Object)
gen.binding('String', 'each_grapheme_cluster', 'StringObject', 'each_grapheme_cluster', argc: 0, pass_env: true, pass_block: true, return_type: :Object)
gen.binding('String', 'each_line', 'StringObject', 'each_line', argc: 0..1, kwargs: [:chomp], pass_env: true, pass_block: true, return_type: :Object)
gen.binding('String', 'empty?', 'StringObject', 'is_empty', argc: 0, pass_env: false, pass_block: false, return_type: :bool)
gen.binding('String', 'encode', 'StringObject', 'encode', argc: 1, pass_env: true, pass_block: false, return_type: :Object)
Expand Down
16 changes: 16 additions & 0 deletions spec/core/string/each_grapheme_cluster_spec.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
require_relative "../../spec_helper"
require_relative 'shared/chars'
require_relative 'shared/grapheme_clusters'
require_relative 'shared/each_char_without_block'

# Spec for String#each_grapheme_cluster: reuses the shared character,
# grapheme-cluster and blockless examples, then checks that a String subclass
# still yields plain String instances.
describe "String#each_grapheme_cluster" do
  it_behaves_like :string_chars, :each_grapheme_cluster
  it_behaves_like :string_grapheme_clusters, :each_grapheme_cluster
  it_behaves_like :string_each_char_without_block, :each_grapheme_cluster

  it "yields String instances for subclasses" do
    yielded_classes = []
    StringSpecs::MyString.new("abc").each_grapheme_cluster do |cluster|
      yielded_classes << cluster.class
    end
    yielded_classes.should == [String, String, String]
  end
end
45 changes: 45 additions & 0 deletions src/encoding/utf8_encoding_object.cpp
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
#include "natalie/encoding/utf8_encoding_object.hpp"
#include "natalie.hpp"

namespace Natalie {
Expand Down Expand Up @@ -145,6 +146,50 @@ std::pair<bool, StringView> Utf8EncodingObject::next_char(const String &string,
return { valid, StringView(&string, i, length) };
}

// Returns a view of the next grapheme cluster in `string`, starting at `*index`,
// and advances `*index` past every character that was folded into the cluster.
//
// This is a simplified approximation of grapheme segmentation, not a full
// implementation of the Unicode rules. Starting from the character returned by
// next_char(), it keeps extending the cluster while the following character is:
//   - a variation selector (U+FE00..U+FE0F),
//   - a zero-width joiner (U+200D), or
//   - the character right after a zero-width joiner (any variation selectors
//     in between are absorbed as well).
//
// Scanning continues after a joined character, so sequences that contain more
// than one joiner (e.g. multi-person emoji) and a variation selector that
// trails a joined character stay inside a single cluster.
//
// The scan stops, without consuming it, at the first character that is invalid,
// empty (end of string), or does not match any of the cases above.
StringView Utf8EncodingObject::next_grapheme_cluster(const String &string, size_t *index) const {
    auto [valid, view] = next_char(string, index);

    // True while the most recent joiner has not yet been followed by the
    // character it glues onto the cluster.
    bool join_next = false;
    auto index2 = *index;
    for (;;) {
        // Peek using a scratch index; *index is only committed once we decide
        // that the peeked character belongs to this cluster.
        auto [valid2, view2] = next_char(string, &index2);
        if (!valid2 || view2.is_empty())
            break;

        // This is a silly way to get this number. Maybe we need an EncodingObject::next_codepoint API...?
        auto codepoint = decode_codepoint(view2);

        // https://en.wikipedia.org/wiki/Variation_Selectors_(Unicode_block)
        const bool is_variation_selector = codepoint >= 0xFE00 && codepoint <= 0xFE0F;

        // Zero-width joiner
        // https://unicode-explorer.com/c/200D
        const bool is_zero_width_joiner = codepoint == 0x200D;

        if (!is_variation_selector && !is_zero_width_joiner && !join_next)
            break;

        view = StringView { &string, view.offset(), view.size() + view2.size() };
        *index = index2;

        if (is_zero_width_joiner) {
            join_next = true;
        } else if (!is_variation_selector) {
            // The joined character has been consumed; a further joiner is
            // required before another ordinary character may be appended.
            join_next = false;
        }
    }

    return view;
}

/*
0x00..0x1F, 0x7F: C0 controls (same as ASCII)
0x80..0x9F: C1 controls
Expand Down
17 changes: 17 additions & 0 deletions src/string_object.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -202,6 +202,23 @@ Value StringObject::codepoints(Env *env, Block *block) {
return ary;
}

// Runs `block` once for every grapheme cluster of this string, as segmented by
// the string's encoding, passing each cluster as a new StringObject that
// carries the same encoding. Returns the receiver once the string is exhausted.
//
// When `block` is null, the result of enum_for(:each_grapheme_cluster) is
// returned instead, with a size block built from StringObject::size_fn.
// NOTE(review): confirm that size_fn reports the number of grapheme clusters
// (rather than characters) for this enumerator.
Value StringObject::each_grapheme_cluster(Env *env, Block *block) {
    if (block) {
        size_t offset = 0;
        while (true) {
            auto cluster = m_encoding->next_grapheme_cluster(m_string, &offset);
            // An empty view means there is nothing left to iterate over.
            if (cluster.is_empty())
                break;
            Value args[] = { new StringObject { cluster, m_encoding } };
            NAT_RUN_BLOCK_AND_POSSIBLY_BREAK(env, block, Args(1, args), nullptr);
        }
        return this;
    }

    Block *size_block = new Block { env, this, StringObject::size_fn, 0 };
    return send(env, "enum_for"_s, { "each_grapheme_cluster"_s }, size_block);
}

String create_padding(String &padding, size_t length) {
size_t quotient = ::floor(length / padding.size());
size_t remainder = length % padding.size();
Expand Down
Loading