Skip to content

Commit

Permalink
add tokenize to node client api
Browse files Browse the repository at this point in the history
  • Loading branch information
jraymakers committed Oct 12, 2023
1 parent 21405f2 commit ead6334
Show file tree
Hide file tree
Showing 6 changed files with 163 additions and 9 deletions.
16 changes: 16 additions & 0 deletions lib/duckdb.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,20 @@ export type ReplacementScanCallback = (
table: string
) => ReplacementScanResult | null;

/**
 * Kinds of token reported in `ScriptTokens.types` by `Database.tokenize`.
 *
 * The numeric values mirror the `TokenType` object exported by the native
 * addon, which also carries the reverse (number to name) entries, so
 * `TokenType[TokenType.KEYWORD]` yields `"KEYWORD"`.
 */
export enum TokenType {
/** A name, e.g. the quoted `";"` in `from ";";`. Misspelled keywords such as `selec` are also reported as identifiers. */
IDENTIFIER = 0,
/** A numeric literal, e.g. `1`. */
NUMERIC_CONSTANT = 1,
/** A string literal, e.g. `';'`. */
STRING_CONSTANT = 2,
/** An operator or punctuation token, e.g. the statement terminator `;`. */
OPERATOR = 3,
/** A SQL keyword, e.g. `select` or `from`. */
KEYWORD = 4,
/** A comment. NOTE(review): the accompanying tests observe that the tokenizer does not currently emit tokens for comments. */
COMMENT = 5,
}

/**
 * Result of `Database.tokenize`: two parallel arrays with one entry per token,
 * in the order the tokens appear in the text.
 */
export interface ScriptTokens {
/** Zero-based start position of each token within the tokenized text. */
offsets: number[];
/** Type of each token; `types[i]` describes the token starting at `offsets[i]`. */
types: TokenType[];
}

export class Database {
constructor(path: string, accessMode?: number | Record<string,string>, callback?: Callback<any>);
constructor(path: string, callback?: Callback<any>);
Expand Down Expand Up @@ -169,6 +183,8 @@ export class Database {
registerReplacementScan(
replacementScan: ReplacementScanCallback
): Promise<void>;

/**
 * Splits `text` into tokens and returns the start offset and type of each one.
 *
 * The call is synchronous and returns the result directly rather than a promise.
 * Text that is not valid SQL is still tokenized (for example, a misspelled
 * keyword comes back as an identifier).
 *
 * @param text - The text to tokenize.
 * @returns Parallel `offsets` and `types` arrays with one entry per token.
 * @throws TypeError if no argument is supplied.
 */
tokenize(text: string): ScriptTokens;
}

export type GenericTypeInfo = {
Expand Down
12 changes: 12 additions & 0 deletions lib/duckdb.js
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,10 @@ var Statement = duckdb.Statement;
* @class
*/
var QueryResult = duckdb.QueryResult;
/**
* Types of tokens returned by `tokenize`.
* Maps each type name to its numeric value and each numeric value back to its name.
*/
var TokenType = duckdb.TokenType;

/**
* @method
Expand Down Expand Up @@ -631,6 +635,14 @@ Database.prototype.unregister_udf = function () {

Database.prototype.registerReplacementScan;

/**
* Return the positions and types of the tokens in the given text.
* @method
* @arg {string} text - text to tokenize
* @return {ScriptTokens} object holding parallel `offsets` and `types` arrays, one entry per token
*/
Database.prototype.tokenize;

/**
* Not implemented
*/
Expand Down
31 changes: 30 additions & 1 deletion src/database.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#include "duckdb/parser/expression/constant_expression.hpp"
#include "duckdb/parser/expression/function_expression.hpp"
#include "duckdb/parser/parser.hpp"
#include "duckdb/parser/tableref/table_function_ref.hpp"
#include "duckdb/storage/buffer_manager.hpp"
#include "duckdb_node.hpp"
Expand All @@ -18,7 +19,8 @@ Napi::FunctionReference Database::Init(Napi::Env env, Napi::Object exports) {
{InstanceMethod("close_internal", &Database::Close), InstanceMethod("wait", &Database::Wait),
InstanceMethod("serialize", &Database::Serialize), InstanceMethod("parallelize", &Database::Parallelize),
InstanceMethod("connect", &Database::Connect), InstanceMethod("interrupt", &Database::Interrupt),
InstanceMethod("registerReplacementScan", &Database::RegisterReplacementScan)});
InstanceMethod("registerReplacementScan", &Database::RegisterReplacementScan),
InstanceMethod("tokenize", &Database::Tokenize)});

exports.Set("Database", t);

Expand Down Expand Up @@ -364,4 +366,31 @@ Napi::Value Database::RegisterReplacementScan(const Napi::CallbackInfo &info) {
return deferred.Promise();
}

/**
 * Tokenize a text and report where each token starts and what kind it is.
 *
 * JavaScript signature: tokenize(text: string) -> { offsets: number[], types: number[] }
 * `offsets[i]` is the start position of the i-th token and `types[i]` its
 * numeric token type; both arrays contain exactly one entry per token.
 *
 * Throws a JavaScript TypeError ("Text argument expected") when the first
 * argument is missing or is not a string.
 */
Napi::Value Database::Tokenize(const Napi::CallbackInfo &info) {
	auto env = info.Env();

	// As<Napi::String>() neither coerces nor validates the value, so reject
	// anything that is not a string here with the intended error
	if (info.Length() < 1 || !info[0].IsString()) {
		throw Napi::TypeError::New(env, "Text argument expected");
	}

	// NOTE(review): the text is handed to the tokenizer as UTF-8, so the reported
	// offsets presumably index UTF-8 bytes; for non-ASCII input these would differ
	// from JavaScript string indices - confirm
	const std::string text = info[0].As<Napi::String>().Utf8Value();

	const auto tokens = duckdb::Parser::Tokenize(text);
	const auto num_tokens = tokens.size();

	auto offsets = Napi::Array::New(env, num_tokens);
	auto types = Napi::Array::New(env, num_tokens);

	for (size_t i = 0; i < num_tokens; i++) {
		// Bind by reference: the token only needs to be read, not copied
		const auto &token = tokens[i];
		const auto index = static_cast<uint32_t>(i);
		offsets.Set(index, Napi::Number::New(env, static_cast<double>(token.start)));
		types.Set(index, Napi::Number::New(env, static_cast<double>(static_cast<uint8_t>(token.type))));
	}

	auto result = Napi::Object::New(env);
	result.Set("offsets", offsets);
	result.Set("types", types);
	return result;
}

} // namespace node_duckdb
37 changes: 29 additions & 8 deletions src/duckdb_node.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,36 @@ NodeDuckDB::NodeDuckDB(Napi::Env env, Napi::Object exports) {
statement_constructor = node_duckdb::Statement::Init(env, exports);
query_result_constructor = node_duckdb::QueryResult::Init(env, exports);

exports.DefineProperties({
DEFINE_CONSTANT_INTEGER(exports, node_duckdb::Database::DUCKDB_NODEJS_ERROR, ERROR) DEFINE_CONSTANT_INTEGER(
auto token_type_enum = Napi::Object::New(env);

token_type_enum.Set("IDENTIFIER", 0);
token_type_enum.Set("NUMERIC_CONSTANT", 1);
token_type_enum.Set("STRING_CONSTANT", 2);
token_type_enum.Set("OPERATOR", 3);
token_type_enum.Set("KEYWORD", 4);
token_type_enum.Set("COMMENT", 5);

// TypeScript enums expose an inverse mapping.
token_type_enum.Set((uint32_t)0, "IDENTIFIER");
token_type_enum.Set((uint32_t)1, "NUMERIC_CONSTANT");
token_type_enum.Set((uint32_t)2, "STRING_CONSTANT");
token_type_enum.Set((uint32_t)3, "OPERATOR");
token_type_enum.Set((uint32_t)4, "KEYWORD");
token_type_enum.Set((uint32_t)5, "COMMENT");

token_type_enum_ref = Napi::ObjectReference::New(token_type_enum);

exports.DefineProperties(
{DEFINE_CONSTANT_INTEGER(exports, node_duckdb::Database::DUCKDB_NODEJS_ERROR, ERROR) DEFINE_CONSTANT_INTEGER(
exports, node_duckdb::Database::DUCKDB_NODEJS_READONLY, OPEN_READONLY) // same as SQLite
DEFINE_CONSTANT_INTEGER(exports, 0, OPEN_READWRITE) // ignored
DEFINE_CONSTANT_INTEGER(exports, 0, OPEN_CREATE) // ignored
DEFINE_CONSTANT_INTEGER(exports, 0, OPEN_FULLMUTEX) // ignored
DEFINE_CONSTANT_INTEGER(exports, 0, OPEN_SHAREDCACHE) // ignored
DEFINE_CONSTANT_INTEGER(exports, 0, OPEN_PRIVATECACHE) // ignored
});
DEFINE_CONSTANT_INTEGER(exports, 0, OPEN_READWRITE) // ignored
DEFINE_CONSTANT_INTEGER(exports, 0, OPEN_CREATE) // ignored
DEFINE_CONSTANT_INTEGER(exports, 0, OPEN_FULLMUTEX) // ignored
DEFINE_CONSTANT_INTEGER(exports, 0, OPEN_SHAREDCACHE) // ignored
DEFINE_CONSTANT_INTEGER(exports, 0, OPEN_PRIVATECACHE) // ignored

Napi::PropertyDescriptor::Value("TokenType", token_type_enum,
static_cast<napi_property_attributes>(napi_enumerable | napi_configurable))});
}

NODE_API_ADDON(NodeDuckDB);
2 changes: 2 additions & 0 deletions src/duckdb_node.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ class NodeDuckDB : public Napi::Addon<NodeDuckDB> {
Napi::FunctionReference connection_constructor;
Napi::FunctionReference statement_constructor;
Napi::FunctionReference query_result_constructor;
Napi::ObjectReference token_type_enum_ref;
};

namespace node_duckdb {
Expand Down Expand Up @@ -109,6 +110,7 @@ class Database : public Napi::ObjectWrap<Database> {
Napi::Value Interrupt(const Napi::CallbackInfo &info);
Napi::Value Close(const Napi::CallbackInfo &info);
Napi::Value RegisterReplacementScan(const Napi::CallbackInfo &info);
Napi::Value Tokenize(const Napi::CallbackInfo &info);

public:
constexpr static int DUCKDB_NODEJS_ERROR = -1;
Expand Down
74 changes: 74 additions & 0 deletions test/tokenize.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
import * as assert from 'assert';
import * as duckdb from '..';

/** One tokenizer scenario: the text to tokenize and the exact result expected back. */
interface TokenizeCase {
    readonly title: string;
    readonly text: string;
    readonly expected: duckdb.ScriptTokens;
}

const T = duckdb.TokenType;

/** Every name declared on `TokenType`, used to verify the reverse (value to name) lookup. */
const tokenTypeNames = [
    'IDENTIFIER',
    'NUMERIC_CONSTANT',
    'STRING_CONSTANT',
    'OPERATOR',
    'KEYWORD',
    'COMMENT',
] as const;

const tokenizeCases: readonly TokenizeCase[] = [
    {
        title: 'should return correct tokens for a single statement',
        text: 'select 1;',
        expected: {
            offsets: [0, 7, 8],
            types: [T.KEYWORD, T.NUMERIC_CONSTANT, T.OPERATOR],
        },
    },
    {
        title: 'should return correct tokens for multiple statements',
        text: 'select 1; select 2;',
        expected: {
            offsets: [0, 7, 8, 10, 17, 18],
            types: [
                T.KEYWORD, T.NUMERIC_CONSTANT, T.OPERATOR,
                T.KEYWORD, T.NUMERIC_CONSTANT, T.OPERATOR,
            ],
        },
    },
    {
        title: 'should return no tokens for an empty string',
        text: '',
        expected: { offsets: [], types: [] },
    },
    {
        title: 'should handle quoted semicolons in string constants',
        text: `select ';';`,
        expected: {
            offsets: [0, 7, 10],
            types: [T.KEYWORD, T.STRING_CONSTANT, T.OPERATOR],
        },
    },
    {
        title: 'should handle quoted semicolons in identifiers',
        text: `from ";";`,
        expected: {
            offsets: [0, 5, 8],
            types: [T.KEYWORD, T.IDENTIFIER, T.OPERATOR],
        },
    },
    {
        // Note that the tokenizer doesn't return tokens for comments.
        title: 'should handle comments',
        text: `select /* comment */ 1`,
        expected: {
            offsets: [0, 21],
            types: [T.KEYWORD, T.NUMERIC_CONSTANT],
        },
    },
    {
        // The misspelled keyword is scanned as an identifier.
        title: 'should handle invalid syntax',
        text: `selec 1`,
        expected: {
            offsets: [0, 6],
            types: [T.IDENTIFIER, T.NUMERIC_CONSTANT],
        },
    },
];

describe('tokenize', function () {
    for (const { title, text, expected } of tokenizeCases) {
        it(title, function () {
            const db = new duckdb.Database(':memory:');
            assert.deepStrictEqual(db.tokenize(text), expected);
        });
    }

    it('should support inverse TokenType mapping', function () {
        for (const name of tokenTypeNames) {
            // strictEqual rather than the legacy loose assert.equal, so a
            // non-string value that merely coerces to the name cannot pass
            assert.strictEqual(T[T[name]], name);
        }
    });
});

0 comments on commit ead6334

Please sign in to comment.