From ead633414ce66e9a926ffc619f0de2a7e7f8ba14 Mon Sep 17 00:00:00 2001 From: Jeff Raymakers Date: Thu, 12 Oct 2023 16:05:39 -0700 Subject: [PATCH] add tokenize to node client api --- lib/duckdb.d.ts | 16 ++++++++++ lib/duckdb.js | 12 +++++++ src/database.cpp | 31 +++++++++++++++++- src/duckdb_node.cpp | 37 +++++++++++++++++----- src/duckdb_node.hpp | 2 ++ test/tokenize.test.ts | 74 +++++++++++++++++++++++++++++++++++++++++++ 6 files changed, 163 insertions(+), 9 deletions(-) create mode 100644 test/tokenize.test.ts diff --git a/lib/duckdb.d.ts b/lib/duckdb.d.ts index 84f7766a..466ce8d3 100644 --- a/lib/duckdb.d.ts +++ b/lib/duckdb.d.ts @@ -127,6 +127,20 @@ export type ReplacementScanCallback = ( table: string ) => ReplacementScanResult | null; +export enum TokenType { + IDENTIFIER = 0, + NUMERIC_CONSTANT = 1, + STRING_CONSTANT = 2, + OPERATOR = 3, + KEYWORD = 4, + COMMENT = 5, +} + +export interface ScriptTokens { + offsets: number[]; + types: TokenType[]; +} + export class Database { constructor(path: string, accessMode?: number | Record, callback?: Callback); constructor(path: string, callback?: Callback); @@ -169,6 +183,8 @@ export class Database { registerReplacementScan( replacementScan: ReplacementScanCallback ): Promise; + + tokenize(text: string): ScriptTokens; } export type GenericTypeInfo = { diff --git a/lib/duckdb.js b/lib/duckdb.js index d29a0706..fc760d2f 100644 --- a/lib/duckdb.js +++ b/lib/duckdb.js @@ -64,6 +64,10 @@ var Statement = duckdb.Statement; * @class */ var QueryResult = duckdb.QueryResult; +/** + * Types of tokens returned by `tokenize`. 
+ */ +var TokenType = duckdb.TokenType; /** * @method @@ -631,6 +635,14 @@ Database.prototype.unregister_udf = function () { Database.prototype.registerReplacementScan; +/** + * Return positions and types of tokens in given text + * @method + * @arg text + * @return {ScriptTokens} + */ +Database.prototype.tokenize; + /** * Not implemented */ diff --git a/src/database.cpp b/src/database.cpp index 5f052e55..0dde71c4 100644 --- a/src/database.cpp +++ b/src/database.cpp @@ -1,5 +1,6 @@ #include "duckdb/parser/expression/constant_expression.hpp" #include "duckdb/parser/expression/function_expression.hpp" +#include "duckdb/parser/parser.hpp" #include "duckdb/parser/tableref/table_function_ref.hpp" #include "duckdb/storage/buffer_manager.hpp" #include "duckdb_node.hpp" @@ -18,7 +19,8 @@ Napi::FunctionReference Database::Init(Napi::Env env, Napi::Object exports) { {InstanceMethod("close_internal", &Database::Close), InstanceMethod("wait", &Database::Wait), InstanceMethod("serialize", &Database::Serialize), InstanceMethod("parallelize", &Database::Parallelize), InstanceMethod("connect", &Database::Connect), InstanceMethod("interrupt", &Database::Interrupt), - InstanceMethod("registerReplacementScan", &Database::RegisterReplacementScan)}); + InstanceMethod("registerReplacementScan", &Database::RegisterReplacementScan), + InstanceMethod("tokenize", &Database::Tokenize)}); exports.Set("Database", t); @@ -364,4 +366,31 @@ Napi::Value Database::RegisterReplacementScan(const Napi::CallbackInfo &info) { return deferred.Promise(); } +Napi::Value Database::Tokenize(const Napi::CallbackInfo &info) { + auto env = info.Env(); + + if (info.Length() < 1) { + throw Napi::TypeError::New(env, "Text argument expected"); + } + + std::string text = info[0].As<Napi::String>(); + + auto tokens = duckdb::Parser::Tokenize(text); + auto numTokens = tokens.size(); + + auto offsets = Napi::Array::New(env, numTokens); + auto types = Napi::Array::New(env, numTokens); + + for (size_t i = 0; i < numTokens; i++) { 
auto token = tokens[i]; + offsets.Set(i, token.start); + types.Set(i, (uint8_t)token.type); + } + + auto result = Napi::Object::New(env); + result.Set("offsets", offsets); + result.Set("types", types); + return result; +} + } // namespace node_duckdb diff --git a/src/duckdb_node.cpp b/src/duckdb_node.cpp index 8ba244e0..22b7f7c2 100644 --- a/src/duckdb_node.cpp +++ b/src/duckdb_node.cpp @@ -12,15 +12,36 @@ NodeDuckDB::NodeDuckDB(Napi::Env env, Napi::Object exports) { statement_constructor = node_duckdb::Statement::Init(env, exports); query_result_constructor = node_duckdb::QueryResult::Init(env, exports); - exports.DefineProperties({ - DEFINE_CONSTANT_INTEGER(exports, node_duckdb::Database::DUCKDB_NODEJS_ERROR, ERROR) DEFINE_CONSTANT_INTEGER( + auto token_type_enum = Napi::Object::New(env); + + token_type_enum.Set("IDENTIFIER", 0); + token_type_enum.Set("NUMERIC_CONSTANT", 1); + token_type_enum.Set("STRING_CONSTANT", 2); + token_type_enum.Set("OPERATOR", 3); + token_type_enum.Set("KEYWORD", 4); + token_type_enum.Set("COMMENT", 5); + + // TypeScript enums expose an inverse mapping. 
+ token_type_enum.Set((uint32_t)0, "IDENTIFIER"); + token_type_enum.Set((uint32_t)1, "NUMERIC_CONSTANT"); + token_type_enum.Set((uint32_t)2, "STRING_CONSTANT"); + token_type_enum.Set((uint32_t)3, "OPERATOR"); + token_type_enum.Set((uint32_t)4, "KEYWORD"); + token_type_enum.Set((uint32_t)5, "COMMENT"); + + token_type_enum_ref = Napi::ObjectReference::New(token_type_enum); + + exports.DefineProperties( + {DEFINE_CONSTANT_INTEGER(exports, node_duckdb::Database::DUCKDB_NODEJS_ERROR, ERROR) DEFINE_CONSTANT_INTEGER( exports, node_duckdb::Database::DUCKDB_NODEJS_READONLY, OPEN_READONLY) // same as SQLite - DEFINE_CONSTANT_INTEGER(exports, 0, OPEN_READWRITE) // ignored - DEFINE_CONSTANT_INTEGER(exports, 0, OPEN_CREATE) // ignored - DEFINE_CONSTANT_INTEGER(exports, 0, OPEN_FULLMUTEX) // ignored - DEFINE_CONSTANT_INTEGER(exports, 0, OPEN_SHAREDCACHE) // ignored - DEFINE_CONSTANT_INTEGER(exports, 0, OPEN_PRIVATECACHE) // ignored - }); + DEFINE_CONSTANT_INTEGER(exports, 0, OPEN_READWRITE) // ignored + DEFINE_CONSTANT_INTEGER(exports, 0, OPEN_CREATE) // ignored + DEFINE_CONSTANT_INTEGER(exports, 0, OPEN_FULLMUTEX) // ignored + DEFINE_CONSTANT_INTEGER(exports, 0, OPEN_SHAREDCACHE) // ignored + DEFINE_CONSTANT_INTEGER(exports, 0, OPEN_PRIVATECACHE) // ignored + + Napi::PropertyDescriptor::Value("TokenType", token_type_enum, + static_cast<napi_property_attributes>(napi_enumerable | napi_configurable))}); } NODE_API_ADDON(NodeDuckDB); diff --git a/src/duckdb_node.hpp b/src/duckdb_node.hpp index e3eea68b..614f32cb 100644 --- a/src/duckdb_node.hpp +++ b/src/duckdb_node.hpp @@ -23,6 +23,7 @@ class NodeDuckDB : public Napi::Addon<NodeDuckDB> { Napi::FunctionReference connection_constructor; Napi::FunctionReference statement_constructor; Napi::FunctionReference query_result_constructor; + Napi::ObjectReference token_type_enum_ref; }; namespace node_duckdb { @@ -109,6 +110,7 @@ class Database : public Napi::ObjectWrap<Database> { Napi::Value Interrupt(const Napi::CallbackInfo &info); Napi::Value Close(const Napi::CallbackInfo &info); 
Napi::Value RegisterReplacementScan(const Napi::CallbackInfo &info); + Napi::Value Tokenize(const Napi::CallbackInfo &info); public: constexpr static int DUCKDB_NODEJS_ERROR = -1; diff --git a/test/tokenize.test.ts b/test/tokenize.test.ts new file mode 100644 index 00000000..8bd8f4bf --- /dev/null +++ b/test/tokenize.test.ts @@ -0,0 +1,74 @@ +import * as assert from 'assert'; +import * as duckdb from '..'; + +describe('tokenize', function () { + it('should return correct tokens for a single statement', function () { + const db = new duckdb.Database(':memory:'); + const output = db.tokenize('select 1;'); + assert.deepStrictEqual(output, { + offsets: [0, 7, 8], + types: [duckdb.TokenType.KEYWORD, duckdb.TokenType.NUMERIC_CONSTANT, duckdb.TokenType.OPERATOR] + }); + }); + it('should return correct tokens for a multiple statements', function () { + const db = new duckdb.Database(':memory:'); + const output = db.tokenize('select 1; select 2;'); + assert.deepStrictEqual(output, { + offsets: [0, 7, 8, 10, 17, 18], + types: [ + duckdb.TokenType.KEYWORD, duckdb.TokenType.NUMERIC_CONSTANT, duckdb.TokenType.OPERATOR, + duckdb.TokenType.KEYWORD, duckdb.TokenType.NUMERIC_CONSTANT, duckdb.TokenType.OPERATOR + ] + }); + }); + it('should return no tokens for an empty string', function () { + const db = new duckdb.Database(':memory:'); + const output = db.tokenize(''); + assert.deepStrictEqual(output, { + offsets: [], + types: [] + }); + }); + it('should handle quoted semicolons in string constants', function () { + const db = new duckdb.Database(':memory:'); + const output = db.tokenize(`select ';';`); + assert.deepStrictEqual(output, { + offsets: [0, 7, 10], + types: [duckdb.TokenType.KEYWORD, duckdb.TokenType.STRING_CONSTANT, duckdb.TokenType.OPERATOR] + }); + }); + it('should handle quoted semicolons in identifiers', function () { + const db = new duckdb.Database(':memory:'); + const output = db.tokenize(`from ";";`); + assert.deepStrictEqual(output, { + offsets: [0, 5, 8], + 
types: [duckdb.TokenType.KEYWORD, duckdb.TokenType.IDENTIFIER, duckdb.TokenType.OPERATOR] + }); + }); + it('should handle comments', function () { + const db = new duckdb.Database(':memory:'); + const output = db.tokenize(`select /* comment */ 1`); + // Note that the tokenizer doesn't return tokens for comments. + assert.deepStrictEqual(output, { + offsets: [0, 21], + types: [duckdb.TokenType.KEYWORD, duckdb.TokenType.NUMERIC_CONSTANT] + }); + }); + it('should handle invalid syntax', function () { + const db = new duckdb.Database(':memory:'); + const output = db.tokenize(`selec 1`); + // The misspelled keyword is scanned as an identifier. + assert.deepStrictEqual(output, { + offsets: [0, 6], + types: [duckdb.TokenType.IDENTIFIER, duckdb.TokenType.NUMERIC_CONSTANT] + }); + }); + it('should support inverse TokenType mapping', function () { + assert.equal(duckdb.TokenType[duckdb.TokenType.IDENTIFIER], "IDENTIFIER"); + assert.equal(duckdb.TokenType[duckdb.TokenType.NUMERIC_CONSTANT], "NUMERIC_CONSTANT"); + assert.equal(duckdb.TokenType[duckdb.TokenType.STRING_CONSTANT], "STRING_CONSTANT"); + assert.equal(duckdb.TokenType[duckdb.TokenType.OPERATOR], "OPERATOR"); + assert.equal(duckdb.TokenType[duckdb.TokenType.KEYWORD], "KEYWORD"); + assert.equal(duckdb.TokenType[duckdb.TokenType.COMMENT], "COMMENT"); + }); +});