Skip to content

Commit

Permalink
Add support for 512-bit vectors in utf-8 validator (simdjson#32)
Browse files Browse the repository at this point in the history
  • Loading branch information
piotrrzysko authored and Squiry committed Feb 29, 2024
1 parent ab13a8c commit 59ae5f4
Show file tree
Hide file tree
Showing 7 changed files with 44 additions and 29 deletions.
8 changes: 4 additions & 4 deletions src/main/java/org/simdjson/CharactersClassifier.java
Original file line number Diff line number Diff line change
Expand Up @@ -9,14 +9,14 @@ class CharactersClassifier {

private static final ByteVector WHITESPACE_TABLE =
ByteVector.fromArray(
StructuralIndexer.SPECIES,
repeat(new byte[]{' ', 100, 100, 100, 17, 100, 113, 2, 100, '\t', '\n', 112, 100, '\r', 100, 100}, StructuralIndexer.SPECIES.vectorByteSize() / 4),
StructuralIndexer.BYTE_SPECIES,
repeat(new byte[]{' ', 100, 100, 100, 17, 100, 113, 2, 100, '\t', '\n', 112, 100, '\r', 100, 100}, StructuralIndexer.BYTE_SPECIES.vectorByteSize() / 4),
0);

private static final ByteVector OP_TABLE =
ByteVector.fromArray(
StructuralIndexer.SPECIES,
repeat(new byte[]{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ':', '{', ',', '}', 0, 0}, StructuralIndexer.SPECIES.vectorByteSize() / 4),
StructuralIndexer.BYTE_SPECIES,
repeat(new byte[]{0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ':', '{', ',', '}', 0, 0}, StructuralIndexer.BYTE_SPECIES.vectorByteSize() / 4),
0);

private static byte[] repeat(byte[] array, int n) {
Expand Down
4 changes: 2 additions & 2 deletions src/main/java/org/simdjson/JsonStringScanner.java
Original file line number Diff line number Diff line change
Expand Up @@ -14,8 +14,8 @@ class JsonStringScanner {
private long prevEscaped = 0;

JsonStringScanner() {
this.backslashMask = ByteVector.broadcast(StructuralIndexer.SPECIES, (byte) '\\');
this.quoteMask = ByteVector.broadcast(StructuralIndexer.SPECIES, (byte) '"');
this.backslashMask = ByteVector.broadcast(StructuralIndexer.BYTE_SPECIES, (byte) '\\');
this.quoteMask = ByteVector.broadcast(StructuralIndexer.BYTE_SPECIES, (byte) '"');
}

JsonStringBlock next(ByteVector chunk0) {
Expand Down
4 changes: 2 additions & 2 deletions src/main/java/org/simdjson/StringParser.java
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ class StringParser {

private static final byte BACKSLASH = '\\';
private static final byte QUOTE = '"';
private static final int BYTES_PROCESSED = StructuralIndexer.SPECIES.vectorByteSize();
private static final int BYTES_PROCESSED = StructuralIndexer.BYTE_SPECIES.vectorByteSize();
private static final int MIN_HIGH_SURROGATE = 0xD800;
private static final int MAX_HIGH_SURROGATE = 0xDBFF;
private static final int MIN_LOW_SURROGATE = 0xDC00;
Expand All @@ -31,7 +31,7 @@ void parseString(byte[] buffer, int idx) {
int src = idx + 1;
int dst = stringBufferIdx + Integer.BYTES;
while (true) {
ByteVector srcVec = ByteVector.fromArray(StructuralIndexer.SPECIES, buffer, src);
ByteVector srcVec = ByteVector.fromArray(StructuralIndexer.BYTE_SPECIES, buffer, src);
srcVec.intoArray(stringBuffer, dst);
long backslashBits = srcVec.eq(BACKSLASH).toLong();
long quoteBits = srcVec.eq(QUOTE).toLong();
Expand Down
40 changes: 28 additions & 12 deletions src/main/java/org/simdjson/StructuralIndexer.java
Original file line number Diff line number Diff line change
@@ -1,27 +1,43 @@
package org.simdjson;

import jdk.incubator.vector.ByteVector;
import jdk.incubator.vector.IntVector;
import jdk.incubator.vector.VectorShape;
import jdk.incubator.vector.VectorSpecies;
import java.lang.invoke.MethodType;

import static jdk.incubator.vector.VectorOperators.UNSIGNED_LE;

class StructuralIndexer {

static final VectorSpecies<Byte> SPECIES;
static final VectorSpecies<Integer> INT_SPECIES;
static final VectorSpecies<Byte> BYTE_SPECIES;
static final int N_CHUNKS;

static {
String species = System.getProperty("org.simdjson.species", "preferred");
SPECIES = switch(species) {
case "preferred" -> ByteVector.SPECIES_PREFERRED;
case "512" -> ByteVector.SPECIES_512;
case "256" -> ByteVector.SPECIES_256;
switch (species) {
case "preferred" -> {
BYTE_SPECIES = ByteVector.SPECIES_PREFERRED;
INT_SPECIES = IntVector.SPECIES_PREFERRED;
}
case "512" -> {
BYTE_SPECIES = ByteVector.SPECIES_512;
INT_SPECIES = IntVector.SPECIES_512;
}
case "256" -> {
BYTE_SPECIES = ByteVector.SPECIES_256;
INT_SPECIES = IntVector.SPECIES_256;
}
default -> throw new IllegalArgumentException("Unsupported vector species: " + species);
};
N_CHUNKS = 64 / SPECIES.vectorByteSize();
if (SPECIES != ByteVector.SPECIES_256 && SPECIES != ByteVector.SPECIES_512) {
throw new IllegalArgumentException("Unsupported vector species: " + SPECIES);
}
N_CHUNKS = 64 / BYTE_SPECIES.vectorByteSize();
assertSupportForSpecies(BYTE_SPECIES);
assertSupportForSpecies(INT_SPECIES);
}

/**
 * Checks that the given species has a vector shape accepted by this class.
 *
 * <p>Only the 256-bit and 512-bit shapes pass.
 *
 * @param species the vector species to check
 * @throws IllegalArgumentException if the shape of {@code species} is neither 256-bit nor 512-bit
 */
private static void assertSupportForSpecies(VectorSpecies<?> species) {
VectorShape shape = species.vectorShape();
boolean supported = shape == VectorShape.S_256_BIT || shape == VectorShape.S_512_BIT;
if (!supported) {
throw new IllegalArgumentException("Unsupported vector species: " + species);
}
}

Expand All @@ -48,7 +64,7 @@ void step(byte[] buffer, int offset, int blockIndex) {
}

private void step1(byte[] buffer, int offset, int blockIndex) {
ByteVector chunk0 = ByteVector.fromArray(ByteVector.SPECIES_512, buffer, offset);
ByteVector chunk0 = ByteVector.fromArray(ByteVector.SPECIES_512, buffer, offset);
JsonStringBlock strings = stringScanner.next(chunk0);
JsonCharacterBlock characters = classifier.classify(chunk0);
long unescaped = lteq(chunk0, (byte) 0x1F);
Expand All @@ -75,7 +91,7 @@ private void finishStep(JsonCharacterBlock characters, JsonStringBlock strings,
bitIndexes.write(blockIndex, prevStructurals);
prevStructurals = potentialStructuralStart & ~strings.stringTail();
unescapedCharsError |= strings.nonQuoteInsideString(unescaped);
}
}

private long lteq(ByteVector chunk0, byte scalar) {
long r = chunk0.compare(UNSIGNED_LE, scalar).toLong();
Expand Down
11 changes: 6 additions & 5 deletions src/main/java/org/simdjson/Utf8Validator.java
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,12 @@

import java.util.Arrays;

public class Utf8Validator {
private static final VectorSpecies<Byte> VECTOR_SPECIES = ByteVector.SPECIES_256;
class Utf8Validator {

private static final VectorSpecies<Byte> VECTOR_SPECIES = StructuralIndexer.BYTE_SPECIES;
private static final ByteVector INCOMPLETE_CHECK = getIncompleteCheck();
private static final VectorShuffle<Integer> SHIFT_FOUR_BYTES_FORWARD = VectorShuffle.iota(IntVector.SPECIES_256,
IntVector.SPECIES_256.elementSize() - 1, 1, true);
// Shuffle over int lanes whose source indexes are [last, 0, 1, ..., last - 1]: it starts at the
// last lane, steps by one and wraps around. The start index is the lane count minus one
// (length()), not elementSize(), which is the lane width in bits.
private static final VectorShuffle<Integer> SHIFT_FOUR_BYTES_FORWARD = VectorShuffle.iota(StructuralIndexer.INT_SPECIES,
StructuralIndexer.INT_SPECIES.length() - 1, 1, true);
private static final ByteVector LOW_NIBBLE_MASK = ByteVector.broadcast(VECTOR_SPECIES, 0b0000_1111);
private static final ByteVector ALL_ASCII_MASK = ByteVector.broadcast(VECTOR_SPECIES, (byte) 0b1000_0000);

Expand Down Expand Up @@ -39,7 +40,7 @@ static void validate(byte[] inputBytes) {

errors |= secondCheck.compare(VectorOperators.NE, 0).toLong();
}
previousFourUtf8Bytes = utf8Vector.reinterpretAsInts().lane(IntVector.SPECIES_256.length() - 1);
previousFourUtf8Bytes = utf8Vector.reinterpretAsInts().lane(StructuralIndexer.INT_SPECIES.length() - 1);
}

// if the input file doesn't align with the vector width, pad the missing bytes with zero
Expand Down
2 changes: 1 addition & 1 deletion src/test/java/org/simdjson/TestUtils.java
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@ static String padWithSpaces(String str) {
}

static ByteVector chunk(String str, int n) {
return ByteVector.fromArray(StructuralIndexer.SPECIES, str.getBytes(UTF_8), n * StructuralIndexer.SPECIES.vectorByteSize());
return ByteVector.fromArray(StructuralIndexer.BYTE_SPECIES, str.getBytes(UTF_8), n * StructuralIndexer.BYTE_SPECIES.vectorByteSize());
}

static byte[] toUtf8(String str) {
Expand Down
4 changes: 1 addition & 3 deletions src/test/java/org/simdjson/Utf8ValidatorTest.java
Original file line number Diff line number Diff line change
@@ -1,19 +1,17 @@
package org.simdjson;

import jdk.incubator.vector.ByteVector;
import jdk.incubator.vector.VectorSpecies;
import org.junit.jupiter.api.Test;
import org.junit.jupiter.params.ParameterizedTest;
import org.junit.jupiter.params.provider.ValueSource;

import java.io.IOException;
import java.util.Arrays;
import java.util.Objects;

import static org.assertj.core.api.Assertions.*;

class Utf8ValidatorTest {
private static final VectorSpecies<Byte> VECTOR_SPECIES = StructuralIndexer.SPECIES;
private static final VectorSpecies<Byte> VECTOR_SPECIES = StructuralIndexer.BYTE_SPECIES;


/* ASCII / 1 BYTE TESTS */
Expand Down

0 comments on commit 59ae5f4

Please sign in to comment.