-
Notifications
You must be signed in to change notification settings - Fork 185
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Improve Ruby string interning #3185
Changes from all commits
f19f382
ce9d91b
f24b689
c19108e
35817af
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -12,19 +12,44 @@ | |
import java.util.Arrays; | ||
import java.util.Objects; | ||
|
||
import com.oracle.truffle.api.strings.InternalByteArray; | ||
import com.oracle.truffle.api.strings.TruffleString; | ||
import org.truffleruby.core.array.ArrayUtils; | ||
import org.truffleruby.core.encoding.RubyEncoding; | ||
import org.truffleruby.core.encoding.TStringUtils; | ||
|
||
public final class TBytesKey { | ||
|
||
private final byte[] bytes; | ||
private final int offset; | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Since we cannot construct TruffleString's There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. On the plus side this actually consumes less memory (i.e., we don't keep the |
||
private final int length; | ||
private RubyEncoding encoding; | ||
private final int bytesHashCode; | ||
|
||
public TBytesKey(byte[] bytes, RubyEncoding encoding) { | ||
public TBytesKey( | ||
byte[] bytes, | ||
int offset, | ||
int length, | ||
int bytesHashCode, | ||
RubyEncoding encoding) { | ||
this.bytes = bytes; | ||
this.offset = offset; | ||
this.length = length; | ||
this.bytesHashCode = bytesHashCode; | ||
this.encoding = encoding; | ||
this.bytesHashCode = Arrays.hashCode(bytes); | ||
} | ||
|
||
public TBytesKey(byte[] bytes, RubyEncoding encoding) { | ||
this(bytes, 0, bytes.length, Arrays.hashCode(bytes), encoding); | ||
} | ||
|
||
public TBytesKey(InternalByteArray byteArray, RubyEncoding encoding) { | ||
this( | ||
byteArray.getArray(), | ||
byteArray.getOffset(), | ||
byteArray.getLength(), | ||
hashCode(byteArray), | ||
encoding); | ||
} | ||
|
||
@Override | ||
|
@@ -37,15 +62,15 @@ public boolean equals(Object o) { | |
if (o instanceof TBytesKey) { | ||
final TBytesKey other = (TBytesKey) o; | ||
if (encoding == null) { | ||
if (Arrays.equals(bytes, other.bytes)) { | ||
if (equalBytes(this, other)) { | ||
// For getMatchedEncoding() | ||
this.encoding = Objects.requireNonNull(other.encoding); | ||
return true; | ||
} else { | ||
return false; | ||
} | ||
} else { | ||
return encoding == other.encoding && Arrays.equals(bytes, other.bytes); | ||
return encoding == other.encoding && equalBytes(this, other); | ||
} | ||
} | ||
|
||
|
@@ -62,4 +87,51 @@ public String toString() { | |
return TruffleString.fromByteArrayUncached(bytes, encoding, false).toString(); | ||
} | ||
|
||
private static int hashCode(InternalByteArray byteArray) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think it would be nice if we could upstream this into Truffle's There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think in InternalByteArray would be a better place, TruffleString has the byte[] intrinsics nowadays. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. That'd be fine, too. There had been some concerns previously about exposing public APIs in something named "internal", but I don't really care where it goes. |
||
return hashCode(byteArray.getArray(), byteArray.getOffset(), byteArray.getLength()); | ||
} | ||
|
||
// A variant of <code>Arrays.hashCode</code> that allows for selecting a range within the array. | ||
private static int hashCode(byte[] bytes, int offset, int length) { | ||
if (bytes == null) { | ||
return 0; | ||
} | ||
|
||
int result = 1; | ||
for (int i = offset; i < offset + length; i++) { | ||
result = 31 * result + bytes[i]; | ||
} | ||
|
||
return result; | ||
} | ||
|
||
private boolean equalBytes(TBytesKey a, TBytesKey b) { | ||
if (a.isPerfectFit() && b.isPerfectFit()) { | ||
return Arrays.equals(a.bytes, b.bytes); | ||
} | ||
Comment on lines
+109
to
+111
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There doesn't seem to be a big perf advantage here so I'd just remove this and use the general case below. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Will Graal optimize both cases? There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. They both end up in There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. It's fine as it is, no need to spend too much time on it, either way is fine. |
||
|
||
return Arrays.equals(a.bytes, a.offset, a.offset + a.length, b.bytes, b.offset, b.offset + b.length); | ||
} | ||
|
||
private boolean isPerfectFit() { | ||
return offset == 0 && length == bytes.length; | ||
} | ||
|
||
public TBytesKey makeCacheable(boolean isImmutable) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Could you document this? (it creates a perfect fit key, so it does not hold on any extra bytes that it does not need when inserting a new entry to avoid leaking bytes outside the substring) |
||
if (isImmutable && isPerfectFit()) { | ||
return new TBytesKey(bytes, encoding); | ||
} | ||
|
||
var simplified = ArrayUtils.extractRange(this.bytes, this.offset, this.offset + this.length); | ||
return new TBytesKey(simplified, encoding); | ||
} | ||
|
||
public TBytesKey withNewEncoding(RubyEncoding encoding) { | ||
return new TBytesKey(bytes, offset, length, bytesHashCode, encoding); | ||
} | ||
|
||
public TruffleString toTruffleString() { | ||
return TStringUtils.fromByteArray(bytes, offset, length, encoding.tencoding); | ||
} | ||
|
||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -9,6 +9,7 @@ | |
*/ | ||
package org.truffleruby.core.string; | ||
|
||
import com.oracle.truffle.api.strings.InternalByteArray; | ||
import com.oracle.truffle.api.strings.TruffleString; | ||
import org.truffleruby.collections.WeakValueCache; | ||
import org.truffleruby.core.encoding.Encodings; | ||
|
@@ -69,20 +70,38 @@ private void register(TruffleString tstring, RubyEncoding encoding) { | |
} | ||
} | ||
|
||
public TruffleString getTString(TruffleString string, RubyEncoding encoding) { | ||
return getTString(TStringUtils.getBytesOrCopy(string, encoding), encoding); | ||
@TruffleBoundary | ||
public TruffleString getTString(TruffleString string, RubyEncoding rubyEncoding) { | ||
assert rubyEncoding != null; | ||
|
||
var byteArray = string.getInternalByteArrayUncached(rubyEncoding.tencoding); | ||
final TBytesKey key = new TBytesKey(byteArray, rubyEncoding); | ||
|
||
return getTString(key, TStringUtils.hasImmutableInternalByteArray(string)); | ||
} | ||
|
||
@TruffleBoundary | ||
public TruffleString getTString(InternalByteArray byteArray, boolean isImmutable, RubyEncoding rubyEncoding) { | ||
nirvdrum marked this conversation as resolved.
Show resolved
Hide resolved
|
||
assert rubyEncoding != null; | ||
|
||
return getTString(new TBytesKey(byteArray, rubyEncoding), isImmutable); | ||
} | ||
|
||
@TruffleBoundary | ||
public TruffleString getTString(byte[] bytes, RubyEncoding rubyEncoding) { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Should still be There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I really wish we had a There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Feel free to add a code comment about it. Here it's a case of "no value to PE this code, and worse due to forcing an extra allocation that might not be needed otherwise". We should only PE what can benefit from PE, there is a warmup cost to PE too much code. |
||
assert rubyEncoding != null; | ||
|
||
final TBytesKey key = new TBytesKey(bytes, rubyEncoding); | ||
return getTString(new TBytesKey(bytes, rubyEncoding), true); | ||
} | ||
|
||
@TruffleBoundary | ||
private TruffleString getTString(TBytesKey lookupKey, boolean isLookupKeyImmutable) { | ||
final TruffleString tstring = bytesToTString.get(lookupKey); | ||
var rubyEncoding = lookupKey.getMatchedEncoding(); | ||
|
||
final TruffleString tstring = bytesToTString.get(key); | ||
if (tstring != null) { | ||
++tstringsReusedCount; | ||
tstringBytesSaved += tstring.byteLength(rubyEncoding.tencoding); | ||
tstringBytesSaved += tstring.byteLength(lookupKey.getMatchedEncoding().tencoding); | ||
|
||
return tstring; | ||
} | ||
|
@@ -92,7 +111,7 @@ public TruffleString getTString(byte[] bytes, RubyEncoding rubyEncoding) { | |
// reference equality optimizations. So, do another search but with a marker encoding. The only guarantee | ||
// we can make about the resulting TruffleString is that it would have the same logical byte[], but that's good enough | ||
// for our purposes. | ||
TBytesKey keyNoEncoding = new TBytesKey(bytes, null); | ||
TBytesKey keyNoEncoding = lookupKey.withNewEncoding(null); | ||
final TruffleString tstringWithSameBytesButDifferentEncoding = bytesToTString.get(keyNoEncoding); | ||
|
||
final TruffleString newTString; | ||
|
@@ -104,12 +123,11 @@ public TruffleString getTString(byte[] bytes, RubyEncoding rubyEncoding) { | |
++byteArrayReusedCount; | ||
tstringBytesSaved += newTString.byteLength(rubyEncoding.tencoding); | ||
} else { | ||
newTString = TStringUtils.fromByteArray(bytes, rubyEncoding); | ||
newTString = lookupKey.toTruffleString(); | ||
} | ||
|
||
// Use the new TruffleString bytes in the cache, so we do not keep bytes alive unnecessarily. | ||
final TBytesKey newKey = new TBytesKey(TStringUtils.getBytesOrCopy(newTString, rubyEncoding), rubyEncoding); | ||
return bytesToTString.addInCacheIfAbsent(newKey, newTString); | ||
return bytesToTString.addInCacheIfAbsent(lookupKey.makeCacheable(isLookupKeyImmutable), newTString); | ||
} | ||
|
||
public boolean contains(TruffleString string, RubyEncoding encoding) { | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Could you document here or somewhere else why we support offset and length? (to avoid extra copying for substrings during lookup, but not when inserting a new entry, ref #makeCacheable)