Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(NODE-6537): add support for binary vectors #730

Merged
merged 13 commits into from
Nov 18, 2024
5 changes: 0 additions & 5 deletions .evergreen/run-big-endian-test.sh

This file was deleted.

10 changes: 10 additions & 0 deletions etc/Dockerfile
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
# Build stage: install dependencies and run the test suite inside a Node 22 image.
FROM node:22 AS build

WORKDIR /bson
# Copy the entire build context into /bson.
COPY . .

# Remove any node_modules that came in with the context, reinstall, then run the tests.
# The commands are chained with &&, so any failing step fails this RUN instruction.
RUN rm -rf node_modules && npm install && npm test

# Output stage: an empty image that only receives the docs directory from the build stage.
FROM scratch

COPY --from=build /bson/docs/ /
22 changes: 22 additions & 0 deletions etc/run-big-endian-test.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
#!/usr/bin/env bash

# At the time of writing, this script is not used in CI,
# but it can be used to locally iterate on big endian bugs.
# buildx requires an output, so the docs directory is exported, which should be a no-op.

# Abort on command failure, on use of unset variables, and on failures inside pipelines;
# echo every command before it runs.
set -o errexit
set -o nounset
set -o pipefail
set -o xtrace

# If you get an error you may have an outdated buildkit version.
# Try running this:
# docker buildx rm builder && docker buildx create --name builder --bootstrap --use

# Build ./etc/Dockerfile for the linux/s390x platform and write the result to ./docs.
# NOTE(review): NODE_ARCH is passed as a build arg, but etc/Dockerfile as shown declares
# no matching ARG - confirm whether it is still needed.
docker buildx build \
--progress=plain \
--platform linux/s390x \
--build-arg="NODE_ARCH=s390x" \
-f ./etc/Dockerfile \
--output type=local,dest=./docs,platform-split=false \
.
218 changes: 218 additions & 0 deletions src/binary.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import { BSONError } from './error';
import { BSON_BINARY_SUBTYPE_UUID_NEW } from './constants';
import { ByteUtils } from './utils/byte_utils';
import { BSONValue } from './bson_value';
import { NumberUtils } from './utils/number_utils';

/**
 * A binary sequence: either a `Uint8Array` or a plain `number[]`.
 * @public
 */
export type BinarySequence = Uint8Array | number[];
Expand Down Expand Up @@ -58,9 +59,18 @@ export class Binary extends BSONValue {
static readonly SUBTYPE_COLUMN = 7;
/** Sensitive BSON type */
static readonly SUBTYPE_SENSITIVE = 8;
/** Vector BSON type (binary subtype 9); the vector helpers store a datatype byte at index 0 and a padding byte at index 1 */
static readonly SUBTYPE_VECTOR = 9;
/** User BSON type */
static readonly SUBTYPE_USER_DEFINED = 128;

/**
 * Datatype identifiers of a Binary Vector (subtype: 9).
 *
 * The vector helpers compare or assign the first byte of the Binary
 * (`buffer[0]`) against these values. The object is frozen, so the
 * mapping cannot be modified at runtime.
 */
static readonly VECTOR_TYPE = Object.freeze({
Int8: 0x03,
Float32: 0x27,
PackedBit: 0x10
} as const);
nbbeeken marked this conversation as resolved.
Show resolved Hide resolved

/**
* The bytes of the Binary value.
*
Expand Down Expand Up @@ -238,6 +248,11 @@ export class Binary extends BSONValue {
/** @internal */
toExtendedJSON(options?: EJSONOptions): BinaryExtendedLegacy | BinaryExtended {
options = options || {};

if (this.sub_type === Binary.SUBTYPE_VECTOR) {
validateBinaryVector(this);
}

const base64String = ByteUtils.toBase64(this.buffer);

const subType = Number(this.sub_type).toString(16);
Expand Down Expand Up @@ -310,6 +325,209 @@ export class Binary extends BSONValue {
const subTypeArg = inspect(this.sub_type, options);
return `Binary.createFromBase64(${base64Arg}, ${subTypeArg})`;
}

/**
 * Copies the elements of an Int8 Vector into a newly allocated Int8Array.
 *
 * The Binary must have the Vector sub_type and its datatype byte
 * (`binary.buffer[0]`) must equal `Binary.VECTOR_TYPE.Int8`.
 *
 * @returns a new Int8Array holding a copy of the bytes that follow the two header bytes, up to `position`
 * @throws BSONError if the sub_type is not Vector, or the datatype is not Int8
 */
public toInt8Array(): Int8Array {
  const isVector = this.sub_type === Binary.SUBTYPE_VECTOR;
  if (!isVector) {
    throw new BSONError('Binary sub_type is not Vector');
  }

  const datatype = this.buffer[0];
  if (datatype !== Binary.VECTOR_TYPE.Int8) {
    throw new BSONError('Binary datatype field is not Int8');
  }

  // Skip the datatype and padding bytes; ArrayBuffer slice() produces an independent copy.
  const { buffer: backingStore, byteOffset } = this.buffer;
  const elements = backingStore.slice(byteOffset + 2, byteOffset + this.position);
  return new Int8Array(elements);
}

/**
 * Copies the elements of a Float32 Vector into a newly allocated Float32Array.
 *
 * The Binary must have the Vector sub_type and its datatype byte
 * (`binary.buffer[0]`) must equal `Binary.VECTOR_TYPE.Float32`.
 *
 * @returns a new Float32Array backed by a copy of the bytes that follow the two header bytes
 * @throws BSONError if the sub_type is not Vector, or the datatype is not Float32
 */
public toFloat32Array(): Float32Array {
  if (this.sub_type !== Binary.SUBTYPE_VECTOR) {
    throw new BSONError('Binary sub_type is not Vector');
  }
  if (this.buffer[0] !== Binary.VECTOR_TYPE.Float32) {
    throw new BSONError('Binary datatype field is not Float32');
  }

  // Copy the payload (everything after the datatype and padding bytes) into its own ArrayBuffer.
  const start = this.buffer.byteOffset + 2;
  const end = this.buffer.byteOffset + this.position;
  const floatBytes = new Uint8Array(this.buffer.buffer.slice(start, end));

  // On a big endian host the copied bytes are swapped in 32-bit groups
  // before being reinterpreted as floats.
  if (NumberUtils.isBigEndian) {
    ByteUtils.swap32(floatBytes);
  }

  return new Float32Array(floatBytes.buffer);
}

/**
 * Copies the packed bytes of a packed bit Vector into a newly allocated Uint8Array.
 *
 * The Binary must have the Vector sub_type and its datatype byte
 * (`binary.buffer[0]`) must equal `Binary.VECTOR_TYPE.PackedBit`.
 * The bits stay packed, eight per byte; use `toBits` to get them unpacked.
 *
 * @returns a new Uint8Array holding a copy of the bytes that follow the two header bytes
 * @throws BSONError if the sub_type is not Vector, or the datatype is not PackedBit
 */
public toPackedBits(): Uint8Array {
  const isVector = this.sub_type === Binary.SUBTYPE_VECTOR;
  if (!isVector) {
    throw new BSONError('Binary sub_type is not Vector');
  }

  const isPackedBit = this.buffer[0] === Binary.VECTOR_TYPE.PackedBit;
  if (!isPackedBit) {
    throw new BSONError('Binary datatype field is not packed bit');
  }

  // Skip the datatype and padding bytes; ArrayBuffer slice() produces an independent copy.
  const payloadStart = this.buffer.byteOffset + 2;
  const payloadEnd = this.buffer.byteOffset + this.position;
  const packed = this.buffer.buffer.slice(payloadStart, payloadEnd);
  return new Uint8Array(packed);
}

/**
 * If this Binary represents a packed bit Vector (`binary.buffer[0] === Binary.VECTOR_TYPE.PackedBit`),
 * returns a copy of the bits unpacked into a new Int8Array, one element (0 or 1) per bit.
 *
 * Bits are read from the most significant bit of each byte to the least significant.
 * The padding byte (`binary.buffer[1]`) is the number of trailing bits of the last byte
 * that are left out of the result.
 *
 * Use `toPackedBits` to get the bits still in packed form.
 *
 * @returns a new Int8Array containing the unpacked bits
 * @throws BSONError if the Binary is not a Vector, the datatype is not PackedBit,
 * or the padding byte is rejected by `validateBinaryVector`
 */
public toBits(): Int8Array {
  if (this.sub_type !== Binary.SUBTYPE_VECTOR) {
    throw new BSONError('Binary sub_type is not Vector');
  }

  if (this.buffer[0] !== Binary.VECTOR_TYPE.PackedBit) {
    throw new BSONError('Binary datatype field is not packed bit');
  }

  // The padding byte feeds directly into the bit count below. Without validation a padding
  // larger than 7 silently yields the wrong number of bits, and a non-zero padding on an
  // empty vector yields a negative length that surfaces as a raw RangeError.
  validateBinaryVector(this);

  const byteCount = this.length() - 2;
  const bitCount = byteCount * 8 - this.buffer[1];
  const bits = new Int8Array(bitCount);

  for (let bitOffset = 0; bitOffset < bits.length; bitOffset++) {
    const byteOffset = bitOffset >>> 3; // floor(bitOffset / 8)
    const byte = this.buffer[byteOffset + 2]; // +2 skips the datatype and padding bytes
    const shift = 7 - (bitOffset % 8); // most significant bit first
    bits[bitOffset] = (byte >> shift) & 1;
  }

  return bits;
}

/**
 * Constructs a Binary representing an Int8 Vector.
 *
 * The bytes of `array` are copied into a newly allocated buffer, after a two byte
 * header made of the Int8 datatype and a padding of zero.
 *
 * @param array - The Int8Array whose bytes are copied into the new Binary
 * @returns a Binary with the Vector sub_type
 */
public static fromInt8Array(array: Int8Array): Binary {
  const headerLength = 2;
  // View the signed elements as raw bytes without copying them yet.
  const payload = new Uint8Array(array.buffer, array.byteOffset, array.byteLength);

  const buffer = ByteUtils.allocate(headerLength + array.byteLength);
  buffer[0] = Binary.VECTOR_TYPE.Int8;
  buffer[1] = 0;
  buffer.set(payload, headerLength);

  return new this(buffer, this.SUBTYPE_VECTOR);
}

/**
 * Constructs a Binary representing a Float32 Vector.
 *
 * The bytes of `array` are copied into a newly allocated buffer, after a two byte
 * header made of the Float32 datatype and a padding of zero. On a big endian host
 * the copied bytes are swapped in 32-bit groups.
 *
 * @param array - The Float32Array whose bytes are copied into the new Binary
 * @returns a Binary with the Vector sub_type
 */
public static fromFloat32Array(array: Float32Array): Binary {
  const binaryBytes = ByteUtils.allocate(array.byteLength + 2);
  binaryBytes[0] = Binary.VECTOR_TYPE.Float32;
  binaryBytes[1] = 0;

  const floatBytes = new Uint8Array(array.buffer, array.byteOffset, array.byteLength);
  binaryBytes.set(floatBytes, 2);

  // Swap only the float payload. subarray() honours binaryBytes' own byteOffset and
  // length, so this stays correct even if the allocated array is a view into a larger
  // ArrayBuffer (a view built from `binaryBytes.buffer` at a fixed offset would not).
  if (NumberUtils.isBigEndian) ByteUtils.swap32(binaryBytes.subarray(2));

  return new this(binaryBytes, this.SUBTYPE_VECTOR);
}

/**
 * Constructs a Binary representing a packed bit Vector from bytes that are already packed.
 *
 * The bytes of `array` are copied into a newly allocated buffer, after a two byte
 * header made of the PackedBit datatype and the given padding. The padding is stored
 * as given; this method does not check it.
 *
 * Use `fromBits` to pack an array of 1s and 0s.
 *
 * @param array - The packed bytes to copy into the new Binary
 * @param padding - The value stored in the padding byte, defaults to 0
 * @returns a Binary with the Vector sub_type
 */
public static fromPackedBits(array: Uint8Array, padding = 0): Binary {
  const headerLength = 2;
  const packed = ByteUtils.allocate(headerLength + array.byteLength);

  packed.set(array, headerLength);
  packed[0] = Binary.VECTOR_TYPE.PackedBit;
  packed[1] = padding;

  return new this(packed, this.SUBTYPE_VECTOR);
}

/**
 * Constructs a Binary representing a packed bit Vector by packing individual bits.
 *
 * Bits are packed eight per byte, the first bit going into the most significant
 * position. The padding byte records how many bits of the last byte are unused.
 *
 * @param bits - The array of 1s and 0s to pack into the Binary instance
 * @returns a Binary with the Vector sub_type
 * @throws BSONError if any element of `bits` is neither 0 nor 1
 */
public static fromBits(bits: ArrayLike<number>): Binary {
  const bitCount = bits.length;
  const dataLength = (bitCount + 7) >>> 3; // ceil(bitCount / 8)

  const bytes = new Uint8Array(2 + dataLength);
  bytes[0] = Binary.VECTOR_TYPE.PackedBit;

  // Number of bits occupied in the final byte; 0 means the final byte is full.
  const bitsInLastByte = bitCount % 8;
  bytes[1] = bitsInLastByte === 0 ? 0 : 8 - bitsInLastByte;

  for (let index = 0; index < bitCount; index++) {
    const bit = bits[index];

    if (bit !== 0 && bit !== 1) {
      throw new BSONError(
        `Invalid bit value at ${index}: must be 0 or 1, found ${bits[index]}`
      );
    }

    // Zero bits need no work: the freshly created array is already zero-filled.
    if (bit === 1) {
      const target = (index >>> 3) + 2; // floor(index / 8), after the two header bytes
      bytes[target] |= 1 << (7 - (index % 8));
    }
  }

  return new this(bytes, Binary.SUBTYPE_VECTOR);
}
}

/**
 * Checks the padding byte of a Binary Vector against its datatype.
 *
 * Binaries whose sub_type is not Vector are ignored. Only the known datatypes
 * (Int8, Float32, PackedBit) are checked; any other datatype passes untouched.
 *
 * @param vector - The Binary to validate
 * @throws BSONError if the padding is not valid for the vector's datatype
 */
export function validateBinaryVector(vector: Binary): void {
  if (vector.sub_type !== Binary.SUBTYPE_VECTOR) return;

  const { Int8, Float32, PackedBit } = Binary.VECTOR_TYPE;
  const size = vector.position;

  // NOTE: Validation is only applied to **KNOWN** vector types.
  // If a new datatype is introduced, a future version of the library will need to add validation.
  const datatype = vector.buffer[0];

  // NOTE: noUncheckedIndexedAccess is not enabled, so TS believes this is always a number.
  // A Binary vector may be empty, in which case the padding is undefined;
  // that value is tolerable for the checks below.
  const padding: number | undefined = vector.buffer[1];

  const isWholeByteType = datatype === Float32 || datatype === Int8;
  if (isWholeByteType && padding !== 0) {
    throw new BSONError('Invalid Vector: padding must be zero for int8 and float32 vectors');
  }

  // The remaining checks only concern packed bit vectors.
  if (datatype !== PackedBit) return;

  // A size of 2 means only the datatype and padding bytes are present.
  if (padding !== 0 && size === 2) {
    throw new BSONError(
      'Invalid Vector: padding must be zero for packed bit vectors that are empty'
    );
  }

  if (padding > 7) {
    throw new BSONError(
      `Invalid Vector: padding must be a value between 0 and 7. found: ${padding}`
    );
  }
}

/** @public */
Expand Down
6 changes: 5 additions & 1 deletion src/parser/serializer.ts
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import { Binary } from '../binary';
import { Binary, validateBinaryVector } from '../binary';
import type { BSONSymbol, DBRef, Document, MaxKey } from '../bson';
import type { Code } from '../code';
import * as constants from '../constants';
Expand Down Expand Up @@ -495,6 +495,10 @@ function serializeBinary(buffer: Uint8Array, key: string, value: Binary, index:
index += NumberUtils.setInt32LE(buffer, index, size);
}

if (value.sub_type === Binary.SUBTYPE_VECTOR) {
validateBinaryVector(value);
}

if (size <= 16) {
for (let i = 0; i < size; i++) buffer[index + i] = data[i];
} else {
Expand Down
2 changes: 2 additions & 0 deletions src/utils/byte_utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,8 @@ export type ByteUtils = {
encodeUTF8Into: (destination: Uint8Array, source: string, byteOffset: number) => number;
/** Generate a Uint8Array filled with random bytes with byteLength */
randomBytes: (byteLength: number) => Uint8Array;
/** Interprets `buffer` as an array of 32-bit values and swaps the byte order in-place. */
swap32: (buffer: Uint8Array) => Uint8Array;
};

declare const Buffer: { new (): unknown; prototype?: { _isBuffer?: boolean } } | undefined;
Expand Down
7 changes: 6 additions & 1 deletion src/utils/node_byte_utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ type NodeJsBuffer = ArrayBufferView &
copy(target: Uint8Array, targetStart: number, sourceStart: number, sourceEnd: number): number;
toString: (this: Uint8Array, encoding: NodeJsEncoding, start?: number, end?: number) => string;
equals: (this: Uint8Array, other: Uint8Array) => boolean;
swap32: (this: NodeJsBuffer) => NodeJsBuffer;
};
type NodeJsBufferConstructor = Omit<Uint8ArrayConstructor, 'from'> & {
alloc: (size: number) => NodeJsBuffer;
Expand Down Expand Up @@ -159,5 +160,9 @@ export const nodeJsByteUtils = {
return nodeJsByteUtils.toLocalBufferType(buffer).write(source, byteOffset, undefined, 'utf8');
},

randomBytes: nodejsRandomBytes
randomBytes: nodejsRandomBytes,

/** Converts `buffer` to the local buffer type and returns the result of calling its `swap32()`. */
swap32(buffer: Uint8Array): NodeJsBuffer {
  const localBuffer = nodeJsByteUtils.toLocalBufferType(buffer);
  return localBuffer.swap32();
}
};
4 changes: 4 additions & 0 deletions src/utils/number_utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,8 @@ const isBigEndian = FLOAT_BYTES[7] === 0;
* A collection of functions that get or set various numeric types and bit widths from a Uint8Array.
*/
export type NumberUtils = {
/** Is true if the current system is big endian. */
isBigEndian: boolean;
/**
* Parses a signed int32 at offset. Throws a `RangeError` if value is negative.
*/
Expand All @@ -35,6 +37,8 @@ export type NumberUtils = {
* @public
*/
export const NumberUtils: NumberUtils = {
isBigEndian,

getNonnegativeInt32LE(source: Uint8Array, offset: number): number {
if (source[offset + 3] > 127) {
throw new RangeError(`Size cannot be negative at offset: ${offset}`);
Expand Down
21 changes: 20 additions & 1 deletion src/utils/web_byte_utils.ts
Original file line number Diff line number Diff line change
Expand Up @@ -193,5 +193,24 @@ export const webByteUtils = {
return bytes.byteLength;
},

randomBytes: webRandomBytes
randomBytes: webRandomBytes,

/**
 * Reverses the byte order of every 4-byte group of `buffer`, in place.
 *
 * @param buffer - The bytes to swap; its length must be a multiple of 4
 * @returns the same `buffer` instance that was passed in
 * @throws RangeError if the length of `buffer` is not a multiple of 4
 */
swap32(buffer: Uint8Array): Uint8Array {
  const misaligned = buffer.length % 4 !== 0;
  if (misaligned) {
    throw new RangeError('Buffer size must be a multiple of 32-bits');
  }

  for (let offset = 0; offset < buffer.length; offset += 4) {
    // Exchange the outer pair of the group...
    const outer = buffer[offset];
    buffer[offset] = buffer[offset + 3];
    buffer[offset + 3] = outer;

    // ...then the inner pair.
    const inner = buffer[offset + 1];
    buffer[offset + 1] = buffer[offset + 2];
    buffer[offset + 2] = inner;
  }

  return buffer;
}
};
Loading