forked from macournoyer/neuralconvo
-
Notifications
You must be signed in to change notification settings - Fork 0
/
tokenizer.lua
55 lines (45 loc) · 1.03 KB
/
tokenizer.lua
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
local lexer = require "pl.lexer"
local yield = coroutine.yield
local M = {}
--- Yield a ("word", text) pair from the running coroutine.
-- @param lexeme the matched text
-- @return whatever values the coroutine is resumed with
local function word(lexeme)
  return yield("word", lexeme)
end
--- Yield a ("quote", text) pair from the running coroutine.
-- @param lexeme the matched text
-- @return whatever values the coroutine is resumed with
local function quote(lexeme)
  return yield("quote", lexeme)
end
--- Yield a ("space", text) pair from the running coroutine.
-- @param lexeme the matched text
-- @return whatever values the coroutine is resumed with
local function space(lexeme)
  return yield("space", lexeme)
end
--- Yield a ("tag", text) pair from the running coroutine.
-- @param lexeme the matched text
-- @return whatever values the coroutine is resumed with
local function tag(lexeme)
  return yield("tag", lexeme)
end
--- Yield a ("punct", text) pair from the running coroutine.
-- @param lexeme the matched text
-- @return whatever values the coroutine is resumed with
local function punct(lexeme)
  return yield("punct", lexeme)
end
--- Yield an ("endpunct", text) pair from the running coroutine.
-- @param lexeme the matched text
-- @return whatever values the coroutine is resumed with
local function endpunct(lexeme)
  return yield("endpunct", lexeme)
end
--- Yield an ("unknown", text) pair from the running coroutine.
-- @param lexeme the matched text
-- @return whatever values the coroutine is resumed with
local function unknown(lexeme)
  return yield("unknown", lexeme)
end
--- Tokenize text by handing it to pl.lexer's scan with this module's rules.
-- Each rule pairs a start-anchored Lua pattern with the callback that yields
-- the corresponding token kind.
-- @param text input passed straight through to lexer.scan
-- @return whatever lexer.scan returns (presumably a token iterator --
--   confirm against the Penlight documentation)
function M.tokenize(text)
  -- NOTE(review): if rules are tried in listed order (as Penlight's lexer
  -- appears to do), "^%-+" shadows the "%-" in the punct class and "^%.+"
  -- shadows the "%." in the second endpunct class. Left untouched on purpose.
  local rules = {
    { "^%s+", space },
    { "^['\"]", quote },
    { "^%w+", word },
    { "^%-+", space },
    { "^[,:;%-]", punct },
    { "^%.+", endpunct },
    { "^[%.%?!]", endpunct },
    { "^</?.->", tag },
    { "^.", unknown },
  }

  -- Filter table keyed by the callbacks themselves; presumably tokens handled
  -- by `space` and `tag` are dropped from the output -- TODO confirm.
  local skipped = { [space] = true, [tag] = true }

  return lexer.scan(text, rules, skipped)
end
--- Join a list of tokens into a single sentence string.
-- Tokens are concatenated with single spaces, then the result is tidied up:
-- a leading lowercase letter is upper-cased, an apostrophe surrounded by
-- spaces is closed up, and the space before , : ; - . ? ! is removed.
-- @param words sequence of strings accepted by table.concat
-- @return the assembled string (exactly one value)
function M.join(words)
  local sentence = table.concat(words, " ")

  -- The outer parentheses discard gsub's trailing substitution count.
  return (sentence
    :gsub("^%l", string.upper)
    :gsub(" (') ", "%1")
    :gsub(" ([,:;%-%.%?!])", "%1"))
end
return M