Skip to content

Commit

Permalink
[GR-19220] Improve Ruby string interning (#3216)
Browse files Browse the repository at this point in the history
PullRequest: truffleruby/3948
  • Loading branch information
eregon committed Aug 17, 2023
2 parents c1d8453 + fd03b3e commit f9c1679
Show file tree
Hide file tree
Showing 10 changed files with 155 additions and 34 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ New features:
Bug fixes:

* Fix `Dir.glob` returning blank string entry with leading `**/` in glob and `base:` argument (@rwstauner).
* Fix class lookup after an object's class has been replaced by `IO#reopen` (@itarato, @eregon).
* Fix class lookup after an object's class has been replaced by `IO#reopen` (@itarato, @nirvdrum, @eregon).
* Fix `Marshal.load` and raise `ArgumentError` when dump is broken and is too short (#3108, @andrykonchin).
* Fix `super` method lookup for unbounded attached methods (#3131, @itarato).
* Fix `Module#define_method(name, Method)` to respect `module_function` visibility (#3181, @andrykonchin).
Expand Down Expand Up @@ -44,6 +44,7 @@ Compatibility:
Performance:

* Improve `Truffle::FeatureLoader.loaded_feature_path` by removing expensive string ops from a loop. Speeds up feature lookup time (#3010, @itarato).
* Improve `String#-@` performance by reducing unnecessary data copying and supporting substring lookups (@nirvdrum)

Changes:

Expand Down
2 changes: 1 addition & 1 deletion doc/user/truffleruby-additions.md
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,7 @@ TruffleRuby provides these non-standard methods and classes that provide additio

### Concurrent Maps

`TruffleRuby::ConcurrentMap` is a key-value data structure, like a `Hash` and using `#hash` and `#eql?` to compare keys and identity to compare values. Unlike `Hash` it is unordered. All methods on `TruffleRuby::ConcurrentMap` are thread-safe but should have higher concurrency than a fully syncronized implementation. It is intended to be used by gems such as [`concurrent-ruby`](https://github.com/ruby-concurrency/concurrent-ruby) - please use via this gem rather than using directly.
`TruffleRuby::ConcurrentMap` is a key-value data structure, like a `Hash` and using `#hash` and `#eql?` to compare keys and identity to compare values. Unlike `Hash` it is unordered. All methods on `TruffleRuby::ConcurrentMap` are thread-safe but should have higher concurrency than a fully synchronized implementation. It is intended to be used by gems such as [`concurrent-ruby`](https://github.com/ruby-concurrency/concurrent-ruby) - please use via this gem rather than using directly.

* `map = TruffleRuby::ConcurrentMap.new([initial_capacity: ...], [load_factor: ...])`

Expand Down
2 changes: 1 addition & 1 deletion spec/ruby/core/io/new_spec.rb
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
require_relative '../../spec_helper'
require_relative 'shared/new'

# NOTE: should be syncronized with library/stringio/initialize_spec.rb
# NOTE: should be synchronized with library/stringio/initialize_spec.rb

describe "IO.new" do
it_behaves_like :io_new, :new
Expand Down
2 changes: 1 addition & 1 deletion spec/ruby/core/io/shared/new.rb
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
require_relative '../fixtures/classes'

# NOTE: should be syncronized with library/stringio/initialize_spec.rb
# NOTE: should be synchronized with library/stringio/initialize_spec.rb

# This group of specs may ONLY contain specs that do successfully create
# an IO instance from the file descriptor returned by #new_fd helper.
Expand Down
6 changes: 6 additions & 0 deletions src/main/java/org/truffleruby/RubyLanguage.java
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@
import com.oracle.truffle.api.source.Source;
import com.oracle.truffle.api.source.SourceSection;
import com.oracle.truffle.api.strings.AbstractTruffleString;
import com.oracle.truffle.api.strings.InternalByteArray;
import com.oracle.truffle.api.strings.TruffleString;
import org.graalvm.options.OptionDescriptors;
import org.truffleruby.annotations.SuppressFBWarnings;
Expand Down Expand Up @@ -788,6 +789,11 @@ public ImmutableRubyString getFrozenStringLiteral(TruffleString tstring, RubyEnc
return frozenStringLiterals.getFrozenStringLiteral(tstring, encoding);
}

public ImmutableRubyString getFrozenStringLiteral(InternalByteArray byteArray, boolean isImmutable,
RubyEncoding encoding) {
return frozenStringLiterals.getFrozenStringLiteral(byteArray, isImmutable, encoding);
}

public long getNextObjectID() {
final long id = nextObjectID.getAndAdd(ObjectSpaceManager.OBJECT_ID_INCREMENT_BY);

Expand Down
19 changes: 14 additions & 5 deletions src/main/java/org/truffleruby/core/encoding/TStringUtils.java
Original file line number Diff line number Diff line change
Expand Up @@ -44,9 +44,13 @@ public static TruffleString.Encoding jcodingToTEncoding(Encoding jcoding) {
}

public static TruffleString fromByteArray(byte[] bytes, TruffleString.Encoding tencoding) {
return fromByteArray(bytes, 0, bytes.length, tencoding);
}

public static TruffleString fromByteArray(byte[] bytes, int offset, int length, TruffleString.Encoding tencoding) {
CompilerAsserts.neverPartOfCompilation(
"Use createString(TruffleString.FromByteArrayNode, byte[], RubyEncoding) instead");
return TruffleString.fromByteArrayUncached(bytes, 0, bytes.length, tencoding, false);
return TruffleString.fromByteArrayUncached(bytes, offset, length, tencoding, false);
}

public static TruffleString fromByteArray(byte[] bytes, RubyEncoding rubyEncoding) {
Expand Down Expand Up @@ -75,8 +79,7 @@ public static TruffleString fromJavaString(String javaString, RubyEncoding encod
public static byte[] getBytesOrCopy(AbstractTruffleString tstring, RubyEncoding encoding) {
CompilerAsserts.neverPartOfCompilation("uncached");
var bytes = tstring.getInternalByteArrayUncached(encoding.tencoding);
if (tstring instanceof TruffleString && bytes.getOffset() == 0 &&
bytes.getLength() == bytes.getArray().length) {
if (tstring.isImmutable() && bytes.getOffset() == 0 && bytes.getLength() == bytes.getArray().length) {
return bytes.getArray();
} else {
return ArrayUtils.extractRange(bytes.getArray(), bytes.getOffset(), bytes.getEnd());
Expand All @@ -88,8 +91,8 @@ public static byte[] getBytesOrCopy(Node node, AbstractTruffleString tstring, Tr
TruffleString.GetInternalByteArrayNode getInternalByteArrayNode,
InlinedConditionProfile noCopyProfile) {
var bytes = getInternalByteArrayNode.execute(tstring, encoding);
if (noCopyProfile.profile(node, tstring instanceof TruffleString && bytes.getOffset() == 0 &&
bytes.getLength() == bytes.getArray().length)) {
if (noCopyProfile.profile(node,
tstring.isImmutable() && bytes.getOffset() == 0 && bytes.getLength() == bytes.getArray().length)) {
return bytes.getArray();
} else {
return ArrayUtils.extractRange(bytes.getArray(), bytes.getOffset(), bytes.getEnd());
Expand Down Expand Up @@ -149,4 +152,10 @@ public static String toJavaStringOrThrow(AbstractTruffleString tstring, RubyEnco
return tstring.toJavaStringUncached();
}
}

public static boolean hasImmutableInternalByteArray(AbstractTruffleString string) {
// Immutable strings trivially have immutable byte arrays.
// Native strings also have immutable byte arrays because we need to copy the data into Java.
return string.isImmutable() || string.isNative();
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@

import com.oracle.truffle.api.CompilerDirectives;
import com.oracle.truffle.api.CompilerDirectives.TruffleBoundary;
import com.oracle.truffle.api.strings.InternalByteArray;
import com.oracle.truffle.api.strings.TruffleString;
import org.truffleruby.collections.WeakValueCache;
import org.truffleruby.core.encoding.RubyEncoding;
Expand All @@ -37,25 +38,23 @@ public FrozenStringLiterals(TStringCache tStringCache) {

@TruffleBoundary
public ImmutableRubyString getFrozenStringLiteral(TruffleString tstring, RubyEncoding encoding) {
if (tstring.isNative()) {
throw CompilerDirectives.shouldNotReachHere();
}

return getFrozenStringLiteral(TStringUtils.getBytesOrCopy(tstring, encoding), encoding);
return getFrozenStringLiteral(tstring.getInternalByteArrayUncached(encoding.tencoding),
TStringUtils.hasImmutableInternalByteArray(tstring),
encoding);
}

@TruffleBoundary
public ImmutableRubyString getFrozenStringLiteral(byte[] bytes, RubyEncoding encoding) {
public ImmutableRubyString getFrozenStringLiteral(InternalByteArray byteArray, boolean isImmutable,
RubyEncoding encoding) {
// Ensure all ImmutableRubyString have a TruffleString from the TStringCache
var cachedTString = tstringCache.getTString(bytes, encoding);
var cachedTString = tstringCache.getTString(byteArray, isImmutable, encoding);
var tstringWithEncoding = new TStringWithEncoding(cachedTString, encoding);

final ImmutableRubyString string = values.get(tstringWithEncoding);
if (string != null) {
return string;
} else {
return values.addInCacheIfAbsent(tstringWithEncoding,
new ImmutableRubyString(cachedTString, encoding));
return values.addInCacheIfAbsent(tstringWithEncoding, new ImmutableRubyString(cachedTString, encoding));
}
}

Expand Down
7 changes: 4 additions & 3 deletions src/main/java/org/truffleruby/core/string/StringNodes.java
Original file line number Diff line number Diff line change
Expand Up @@ -4357,10 +4357,11 @@ public abstract static class InternNode extends PrimitiveArrayArgumentsNode {
@Specialization
protected ImmutableRubyString internString(RubyString string,
@Cached RubyStringLibrary libString,
@Cached TruffleString.AsManagedNode asManagedNode) {
@Cached TruffleString.GetInternalByteArrayNode getInternalByteArrayNode) {
var encoding = libString.getEncoding(string);
TruffleString immutableManagedString = asManagedNode.execute(string.tstring, encoding.tencoding);
return getLanguage().getFrozenStringLiteral(immutableManagedString, encoding);
var byteArray = getInternalByteArrayNode.execute(string.tstring, encoding.tencoding);
return getLanguage().getFrozenStringLiteral(byteArray,
TStringUtils.hasImmutableInternalByteArray(string.tstring), encoding);
}
}

Expand Down
91 changes: 87 additions & 4 deletions src/main/java/org/truffleruby/core/string/TBytesKey.java
Original file line number Diff line number Diff line change
Expand Up @@ -12,19 +12,49 @@
import java.util.Arrays;
import java.util.Objects;

import com.oracle.truffle.api.strings.InternalByteArray;
import com.oracle.truffle.api.strings.TruffleString;
import org.truffleruby.core.array.ArrayUtils;
import org.truffleruby.core.encoding.RubyEncoding;
import org.truffleruby.core.encoding.TStringUtils;

public final class TBytesKey {

private final byte[] bytes;
private final int offset;
private final int length;
private RubyEncoding encoding;
private final int bytesHashCode;

public TBytesKey(byte[] bytes, RubyEncoding encoding) {
public TBytesKey(
byte[] bytes,
int offset,
int length,
int bytesHashCode,
RubyEncoding encoding) {
this.bytes = bytes;
this.offset = offset;
this.length = length;
this.bytesHashCode = bytesHashCode;
this.encoding = encoding;
this.bytesHashCode = Arrays.hashCode(bytes);
}

public TBytesKey(byte[] bytes, RubyEncoding encoding) {
this(bytes, 0, bytes.length, Arrays.hashCode(bytes), encoding);
}

/** Supports the creation of a cache key using a subset of bytes. This key *must* be used for lookups only. If you
* want to insert into the cache, you *must* use the result of {@link #makeCacheable(boolean)}.
*
* @param byteArray A byte array retrieved from a {@link TruffleString}
* @param encoding The Ruby encoding object needed to properly decode the associated byte array */
public TBytesKey(InternalByteArray byteArray, RubyEncoding encoding) {
this(
byteArray.getArray(),
byteArray.getOffset(),
byteArray.getLength(),
hashCode(byteArray),
encoding);
}

@Override
Expand All @@ -37,15 +67,15 @@ public boolean equals(Object o) {
if (o instanceof TBytesKey) {
final TBytesKey other = (TBytesKey) o;
if (encoding == null) {
if (Arrays.equals(bytes, other.bytes)) {
if (equalBytes(this, other)) {
// For getMatchedEncoding()
this.encoding = Objects.requireNonNull(other.encoding);
return true;
} else {
return false;
}
} else {
return encoding == other.encoding && Arrays.equals(bytes, other.bytes);
return encoding == other.encoding && equalBytes(this, other);
}
}

Expand All @@ -62,4 +92,57 @@ public String toString() {
return TruffleString.fromByteArrayUncached(bytes, encoding, false).toString();
}

private static int hashCode(InternalByteArray byteArray) {
return hashCode(byteArray.getArray(), byteArray.getOffset(), byteArray.getLength());
}

/** A variant of {@link Arrays#hashCode(byte[])} that allows for selecting a range within the array. */
private static int hashCode(byte[] bytes, int offset, int length) {
int result = 1;
for (int i = offset; i < offset + length; i++) {
result = 31 * result + bytes[i];
}

return result;
}

private boolean equalBytes(TBytesKey a, TBytesKey b) {
return Arrays.equals(a.bytes, a.offset, a.offset + a.length, b.bytes, b.offset, b.offset + b.length);
}

private boolean isPerfectFit() {
return offset == 0 && length == bytes.length;
}

/** Returns a cache key suitable for insertion into the string cache. It's quite common that we want to cache a
* substring. Since we don't want to retain the entire original string, we resolve the substring by making a copy of
* the byte range that we need. However, that is a costly operation and that work is discarded in the event of a
* cache hit. To avoid incurring that cost unnecessarily, we allow cache keys to refer to a subset of a byte array.
* While that saves computation during a cache lookup, it means such keys are unsuitable for insertion into the
* cache. This method makes a key we can use safely for insertion.
* <p>
* If we know that the key refers to an immutable byte array and the key does not refer to a substring, we can
* safely refer to the original byte array without needing to make an additional copy.
*
* @param isImmutable whether the key's byte array is immutable
* @return a cache key suitable for insertion */
public TBytesKey makeCacheable(boolean isImmutable) {
if (isImmutable && isPerfectFit()) {
return this;
}

// Make a copy of the substring's bytes so we can cache them without retaining the larger original byte array.
var resolvedSubstring = ArrayUtils.extractRange(this.bytes, this.offset, this.offset + this.length);

return new TBytesKey(resolvedSubstring, encoding);
}

public TBytesKey withNewEncoding(RubyEncoding encoding) {
return new TBytesKey(bytes, offset, length, bytesHashCode, encoding);
}

public TruffleString toTruffleString() {
return TStringUtils.fromByteArray(bytes, offset, length, encoding.tencoding);
}

}
40 changes: 31 additions & 9 deletions src/main/java/org/truffleruby/core/string/TStringCache.java
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
*/
package org.truffleruby.core.string;

import com.oracle.truffle.api.strings.InternalByteArray;
import com.oracle.truffle.api.strings.TruffleString;
import org.truffleruby.collections.WeakValueCache;
import org.truffleruby.core.encoding.Encodings;
Expand All @@ -20,6 +21,10 @@
import com.oracle.truffle.api.CompilerDirectives;
import com.oracle.truffle.api.CompilerDirectives.TruffleBoundary;

/** This cache caches (byte[], encoding) to TruffleString. The main value is from using it for string literals in files
* without {@code # frozen_string_literal: true}, so equivalent string literals are shared. For most other usages there
* is another higher-level cache but this cache then helps to deduplicate TruffleString's across the different
* higher-level caches. */
public final class TStringCache {

private final WeakValueCache<TBytesKey, TruffleString> bytesToTString = new WeakValueCache<>();
Expand Down Expand Up @@ -69,20 +74,38 @@ private void register(TruffleString tstring, RubyEncoding encoding) {
}
}

public TruffleString getTString(TruffleString string, RubyEncoding encoding) {
return getTString(TStringUtils.getBytesOrCopy(string, encoding), encoding);
@TruffleBoundary
public TruffleString getTString(TruffleString string, RubyEncoding rubyEncoding) {
assert rubyEncoding != null;

var byteArray = string.getInternalByteArrayUncached(rubyEncoding.tencoding);
final TBytesKey key = new TBytesKey(byteArray, rubyEncoding);

return getTString(key, TStringUtils.hasImmutableInternalByteArray(string));
}

@TruffleBoundary
public TruffleString getTString(InternalByteArray byteArray, boolean isImmutable, RubyEncoding rubyEncoding) {
assert rubyEncoding != null;

return getTString(new TBytesKey(byteArray, rubyEncoding), isImmutable);
}

@TruffleBoundary
public TruffleString getTString(byte[] bytes, RubyEncoding rubyEncoding) {
assert rubyEncoding != null;

final TBytesKey key = new TBytesKey(bytes, rubyEncoding);
return getTString(new TBytesKey(bytes, rubyEncoding), true);
}

@TruffleBoundary
private TruffleString getTString(TBytesKey lookupKey, boolean isLookupKeyImmutable) {
final TruffleString tstring = bytesToTString.get(lookupKey);
var rubyEncoding = lookupKey.getMatchedEncoding();

final TruffleString tstring = bytesToTString.get(key);
if (tstring != null) {
++tstringsReusedCount;
tstringBytesSaved += tstring.byteLength(rubyEncoding.tencoding);
tstringBytesSaved += tstring.byteLength(lookupKey.getMatchedEncoding().tencoding);

return tstring;
}
Expand All @@ -92,7 +115,7 @@ public TruffleString getTString(byte[] bytes, RubyEncoding rubyEncoding) {
// reference equality optimizations. So, do another search but with a marker encoding. The only guarantee
// we can make about the resulting TruffleString is that it would have the same logical byte[], but that's good enough
// for our purposes.
TBytesKey keyNoEncoding = new TBytesKey(bytes, null);
TBytesKey keyNoEncoding = lookupKey.withNewEncoding(null);
final TruffleString tstringWithSameBytesButDifferentEncoding = bytesToTString.get(keyNoEncoding);

final TruffleString newTString;
Expand All @@ -104,12 +127,11 @@ public TruffleString getTString(byte[] bytes, RubyEncoding rubyEncoding) {
++byteArrayReusedCount;
tstringBytesSaved += newTString.byteLength(rubyEncoding.tencoding);
} else {
newTString = TStringUtils.fromByteArray(bytes, rubyEncoding);
newTString = lookupKey.toTruffleString();
}

// Use the new TruffleString bytes in the cache, so we do not keep bytes alive unnecessarily.
final TBytesKey newKey = new TBytesKey(TStringUtils.getBytesOrCopy(newTString, rubyEncoding), rubyEncoding);
return bytesToTString.addInCacheIfAbsent(newKey, newTString);
return bytesToTString.addInCacheIfAbsent(lookupKey.makeCacheable(isLookupKeyImmutable), newTString);
}

public boolean contains(TruffleString string, RubyEncoding encoding) {
Expand Down

0 comments on commit f9c1679

Please sign in to comment.