Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add tokenize to node client api #11

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletions lib/duckdb.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,20 @@ export type ReplacementScanCallback = (
table: string
) => ReplacementScanResult | null;

/**
 * Kinds of token that can appear in `ScriptTokens.types`.
 *
 * This is a numeric enum, so the inverse (value-to-name) lookup is also
 * available, e.g. `TokenType[TokenType.KEYWORD] === "KEYWORD"`.
 */
export enum TokenType {
/** An identifier, e.g. a double-quoted name. An unrecognized word such as a misspelled keyword is also reported as an identifier. */
IDENTIFIER = 0,
/** A numeric literal, e.g. `1`. */
NUMERIC_CONSTANT = 1,
/** A string literal, e.g. `';'`. */
STRING_CONSTANT = 2,
/** An operator; the `;` statement terminator is reported with this type. */
OPERATOR = 3,
/** A SQL keyword, e.g. `select` or `from`. */
KEYWORD = 4,
/** A comment. NOTE(review): the tokenize tests state that the tokenizer does not return tokens for comments — confirm whether this value is ever produced. */
COMMENT = 5,
}

/**
 * Result of `Database.tokenize`: two parallel arrays holding one entry per
 * token, in the order the tokens appear in the text.
 */
export interface ScriptTokens {
/**
 * Start offset of each token within the tokenized text.
 * NOTE(review): the unit (bytes vs. UTF-16 code units) is not established
 * here — confirm before relying on it for non-ASCII text.
 */
offsets: number[];
/** Type of each token; `types[i]` describes the token starting at `offsets[i]`. */
types: TokenType[];
}

export class Database {
constructor(path: string, accessMode?: number | Record<string,string>, callback?: Callback<any>);
constructor(path: string, callback?: Callback<any>);
Expand Down Expand Up @@ -169,6 +183,8 @@ export class Database {
registerReplacementScan(
replacementScan: ReplacementScanCallback
): Promise<void>;

tokenize(text: string): ScriptTokens;
}

export type GenericTypeInfo = {
Expand Down
12 changes: 12 additions & 0 deletions lib/duckdb.js
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,10 @@ var Statement = duckdb.Statement;
* @class
*/
var QueryResult = duckdb.QueryResult;
/**
* Types of tokens returned by `tokenize`.
*/
var TokenType = duckdb.TokenType;

/**
* @method
Expand Down Expand Up @@ -631,6 +635,14 @@ Database.prototype.unregister_udf = function () {

Database.prototype.registerReplacementScan;

/**
* Return the positions and types of the tokens in the given text.
* @method
* @arg {string} text - text to tokenize
* @return {ScriptTokens} object with parallel `offsets` and `types` arrays, one entry per token
*/
Database.prototype.tokenize;

/**
* Not implemented
*/
Expand Down
31 changes: 30 additions & 1 deletion src/database.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
#include "duckdb/parser/expression/constant_expression.hpp"
#include "duckdb/parser/expression/function_expression.hpp"
#include "duckdb/parser/parser.hpp"
#include "duckdb/parser/tableref/table_function_ref.hpp"
#include "duckdb/storage/buffer_manager.hpp"
#include "duckdb_node.hpp"
Expand All @@ -18,7 +19,8 @@ Napi::FunctionReference Database::Init(Napi::Env env, Napi::Object exports) {
{InstanceMethod("close_internal", &Database::Close), InstanceMethod("wait", &Database::Wait),
InstanceMethod("serialize", &Database::Serialize), InstanceMethod("parallelize", &Database::Parallelize),
InstanceMethod("connect", &Database::Connect), InstanceMethod("interrupt", &Database::Interrupt),
InstanceMethod("registerReplacementScan", &Database::RegisterReplacementScan)});
InstanceMethod("registerReplacementScan", &Database::RegisterReplacementScan),
InstanceMethod("tokenize", &Database::Tokenize)});

exports.Set("Database", t);

Expand Down Expand Up @@ -364,4 +366,31 @@ Napi::Value Database::RegisterReplacementScan(const Napi::CallbackInfo &info) {
return deferred.Promise();
}

/**
 * Tokenize the given text and report where each token starts and what type it has.
 *
 * Expects a single JavaScript string argument. Returns a plain JavaScript
 * object with two parallel arrays of equal length:
 *   - `offsets`: the `start` of each token as reported by duckdb::Parser::Tokenize
 *   - `types`:   the numeric value of each token's type
 *
 * Throws a JavaScript TypeError when the argument is missing or is not a string.
 */
Napi::Value Database::Tokenize(const Napi::CallbackInfo &info) {
	auto env = info.Env();

	// As<Napi::String>() performs no type check of its own, so validate here to
	// give callers a descriptive TypeError instead of a generic conversion failure.
	if (info.Length() < 1 || !info[0].IsString()) {
		throw Napi::TypeError::New(env, "Text argument expected");
	}

	std::string text = info[0].As<Napi::String>();

	auto tokens = duckdb::Parser::Tokenize(text);
	auto numTokens = tokens.size();

	// Two parallel arrays: entry i of each describes token i.
	auto offsets = Napi::Array::New(env, numTokens);
	auto types = Napi::Array::New(env, numTokens);

	for (size_t i = 0; i < numTokens; i++) {
		// Bind by reference; there is no need to copy each token.
		const auto &token = tokens[i];
		offsets.Set(i, token.start);
		// Expose the token type to JavaScript as its numeric value.
		types.Set(i, static_cast<uint8_t>(token.type));
	}

	auto result = Napi::Object::New(env);
	result.Set("offsets", offsets);
	result.Set("types", types);
	return result;
}

} // namespace node_duckdb
37 changes: 29 additions & 8 deletions src/duckdb_node.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,15 +12,36 @@ NodeDuckDB::NodeDuckDB(Napi::Env env, Napi::Object exports) {
statement_constructor = node_duckdb::Statement::Init(env, exports);
query_result_constructor = node_duckdb::QueryResult::Init(env, exports);

exports.DefineProperties({
DEFINE_CONSTANT_INTEGER(exports, node_duckdb::Database::DUCKDB_NODEJS_ERROR, ERROR) DEFINE_CONSTANT_INTEGER(
auto token_type_enum = Napi::Object::New(env);

token_type_enum.Set("IDENTIFIER", 0);
token_type_enum.Set("NUMERIC_CONSTANT", 1);
token_type_enum.Set("STRING_CONSTANT", 2);
token_type_enum.Set("OPERATOR", 3);
token_type_enum.Set("KEYWORD", 4);
token_type_enum.Set("COMMENT", 5);

// TypeScript enums expose an inverse mapping.
token_type_enum.Set((uint32_t)0, "IDENTIFIER");
token_type_enum.Set((uint32_t)1, "NUMERIC_CONSTANT");
token_type_enum.Set((uint32_t)2, "STRING_CONSTANT");
token_type_enum.Set((uint32_t)3, "OPERATOR");
token_type_enum.Set((uint32_t)4, "KEYWORD");
token_type_enum.Set((uint32_t)5, "COMMENT");

token_type_enum_ref = Napi::ObjectReference::New(token_type_enum);

exports.DefineProperties(
{DEFINE_CONSTANT_INTEGER(exports, node_duckdb::Database::DUCKDB_NODEJS_ERROR, ERROR) DEFINE_CONSTANT_INTEGER(
exports, node_duckdb::Database::DUCKDB_NODEJS_READONLY, OPEN_READONLY) // same as SQLite
DEFINE_CONSTANT_INTEGER(exports, 0, OPEN_READWRITE) // ignored
DEFINE_CONSTANT_INTEGER(exports, 0, OPEN_CREATE) // ignored
DEFINE_CONSTANT_INTEGER(exports, 0, OPEN_FULLMUTEX) // ignored
DEFINE_CONSTANT_INTEGER(exports, 0, OPEN_SHAREDCACHE) // ignored
DEFINE_CONSTANT_INTEGER(exports, 0, OPEN_PRIVATECACHE) // ignored
});
DEFINE_CONSTANT_INTEGER(exports, 0, OPEN_READWRITE) // ignored
DEFINE_CONSTANT_INTEGER(exports, 0, OPEN_CREATE) // ignored
DEFINE_CONSTANT_INTEGER(exports, 0, OPEN_FULLMUTEX) // ignored
DEFINE_CONSTANT_INTEGER(exports, 0, OPEN_SHAREDCACHE) // ignored
DEFINE_CONSTANT_INTEGER(exports, 0, OPEN_PRIVATECACHE) // ignored

Napi::PropertyDescriptor::Value("TokenType", token_type_enum,
static_cast<napi_property_attributes>(napi_enumerable | napi_configurable))});
}

NODE_API_ADDON(NodeDuckDB);
2 changes: 2 additions & 0 deletions src/duckdb_node.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ class NodeDuckDB : public Napi::Addon<NodeDuckDB> {
Napi::FunctionReference connection_constructor;
Napi::FunctionReference statement_constructor;
Napi::FunctionReference query_result_constructor;
Napi::ObjectReference token_type_enum_ref;
};

namespace node_duckdb {
Expand Down Expand Up @@ -109,6 +110,7 @@ class Database : public Napi::ObjectWrap<Database> {
Napi::Value Interrupt(const Napi::CallbackInfo &info);
Napi::Value Close(const Napi::CallbackInfo &info);
Napi::Value RegisterReplacementScan(const Napi::CallbackInfo &info);
Napi::Value Tokenize(const Napi::CallbackInfo &info);

public:
constexpr static int DUCKDB_NODEJS_ERROR = -1;
Expand Down
74 changes: 74 additions & 0 deletions test/tokenize.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
import * as assert from 'assert';
import * as duckdb from '..';

/**
 * Tests for `Database.tokenize`.
 *
 * Each tokenization case feeds one piece of text to a fresh in-memory database
 * and checks the exact `offsets` / `types` arrays that come back. A final test
 * checks that `TokenType` supports the inverse (value-to-name) lookup.
 */
describe('tokenize', function () {
  const T = duckdb.TokenType;

  // One entry per tokenization scenario: the input text and the exact result expected.
  const cases: ReadonlyArray<{
    name: string;
    text: string;
    offsets: number[];
    types: duckdb.TokenType[];
  }> = [
    {
      name: 'should return correct tokens for a single statement',
      text: 'select 1;',
      offsets: [0, 7, 8],
      types: [T.KEYWORD, T.NUMERIC_CONSTANT, T.OPERATOR],
    },
    {
      name: 'should return correct tokens for multiple statements',
      text: 'select 1; select 2;',
      offsets: [0, 7, 8, 10, 17, 18],
      types: [
        T.KEYWORD, T.NUMERIC_CONSTANT, T.OPERATOR,
        T.KEYWORD, T.NUMERIC_CONSTANT, T.OPERATOR,
      ],
    },
    {
      name: 'should return no tokens for an empty string',
      text: '',
      offsets: [],
      types: [],
    },
    {
      name: 'should handle quoted semicolons in string constants',
      text: `select ';';`,
      offsets: [0, 7, 10],
      types: [T.KEYWORD, T.STRING_CONSTANT, T.OPERATOR],
    },
    {
      name: 'should handle quoted semicolons in identifiers',
      text: `from ";";`,
      offsets: [0, 5, 8],
      types: [T.KEYWORD, T.IDENTIFIER, T.OPERATOR],
    },
    {
      // Note that the tokenizer doesn't return tokens for comments.
      name: 'should handle comments',
      text: `select /* comment */ 1`,
      offsets: [0, 21],
      types: [T.KEYWORD, T.NUMERIC_CONSTANT],
    },
    {
      // The misspelled keyword is scanned as an identifier.
      name: 'should handle invalid syntax',
      text: `selec 1`,
      offsets: [0, 6],
      types: [T.IDENTIFIER, T.NUMERIC_CONSTANT],
    },
  ];

  for (const { name, text, offsets, types } of cases) {
    it(name, function () {
      // A fresh database per test keeps the cases independent of each other.
      const db = new duckdb.Database(':memory:');
      assert.deepStrictEqual(db.tokenize(text), { offsets, types });
    });
  }

  it('should support inverse TokenType mapping', function () {
    const names = [
      'IDENTIFIER',
      'NUMERIC_CONSTANT',
      'STRING_CONSTANT',
      'OPERATOR',
      'KEYWORD',
      'COMMENT',
    ] as const;
    for (const name of names) {
      // strictEqual rather than the legacy loose `equal`: the lookup must yield exactly the name string.
      assert.strictEqual(T[T[name]], name);
    }
  });
});