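Add an Elasticsearch-backed tokenizer to h2o-core.

hex.ElasticsearchTokenizer is an MRTask that POSTs the content of each
String cell to an Elasticsearch _analyze endpoint (via the curl4j client,
added below as a new h2o-core dependency) and writes the returned tokens
into a single String column, delimiting each row's token sequence with an
NA, matching RegexTokenizer's output contract. Tokens shorter than a
configurable minimum length are dropped. AstTokenize is extended so that a
"regex" argument of the form "tokenize:elasticsearch:<url>" dispatches to
the new tokenizer instead of RegexTokenizer; an "analyzer" query parameter
on the URL selects the Elasticsearch analyzer.

A minimal usage sketch (the endpoint URL and the input Frame "fr" are
illustrative assumptions, not part of this patch):

    // Assumes an Elasticsearch node reachable at localhost:9200 and a
    // Frame "fr" containing only String columns.
    Frame tokens = new ElasticsearchTokenizer.Builder()
            .setUrl("http://localhost:9200/_analyze?analyzer=standard")
            .setMinLength(2)
            .create()
            .transform(fr);

The same path is reachable from Rapids through the tokenize primitive,
e.g. (tokenize fr "tokenize:elasticsearch:http://localhost:9200/_analyze?analyzer=standard").

---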
diff --git a/h2o-core/build.gradle b/h2o-core/build.gradle
index 35fe0f9889..b5f972fddf 100644
--- a/h2o-core/build.gradle
+++ b/h2o-core/build.gradle
@@ -32,6 +32,8 @@ dependencies {
exclude group: 'org.mapdb', module: 'mapdb'
}

+ api 'org.codelibs:curl4j:1.2.8'
+
testImplementation project(':h2o-test-support')
testRuntimeOnly project(":${defaultWebserverModule}")
testCompileOnly "javax.servlet:javax.servlet-api:${servletApiVersion}"
diff --git a/h2o-core/src/main/java/hex/ElasticsearchTokenizer.java b/h2o-core/src/main/java/hex/ElasticsearchTokenizer.java
new file mode 100644
index 0000000000..5e25a77285
--- /dev/null
+++ b/h2o-core/src/main/java/hex/ElasticsearchTokenizer.java
@@ -0,0 +1,151 @@
+package hex;
+
+import com.google.gson.Gson;
+import com.google.gson.GsonBuilder;
+import org.apache.commons.lang.StringUtils;
+import org.codelibs.curl.Curl;
+import org.codelibs.curl.CurlResponse;
+import water.MRTask;
+import water.fvec.Chunk;
+import water.fvec.Frame;
+import water.fvec.NewChunk;
+import water.fvec.Vec;
+import water.parser.BufferedString;
+
+import java.io.BufferedReader;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.nio.charset.StandardCharsets;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.List;
+import java.util.Map;
+
+/**
+ * ElasticsearchTokenizer splits rows of a given Frame into delimited sequences of tokens
+ * by calling the _analyze API of an Elasticsearch cluster.
+ * The output structure is suitable for use in the word2vec algorithm.
+ *
+ * <p>
+ * Example usage:
+ * <pre>{@code
+ * final ElasticsearchTokenizer tokenizer = new ElasticsearchTokenizer.Builder()
+ * .setUrl("http://localhost:9200/_analyze?analyzer=standard")
+ * .setMinLength(2)
+ * .create();
+ * final Frame tokens = tokenizer.transform(inputFrame);
+ * }
+ * </pre>
+ */
+public class ElasticsearchTokenizer extends MRTask<ElasticsearchTokenizer> {
+ private final String url;
+ private final int minLength;
+ private final String analyzer;
+
+ public ElasticsearchTokenizer(String url) {
+ this(url, 0);
+ }
+
+ private ElasticsearchTokenizer(String url, int minLength) {
+ String[] values = StringUtils.split(url, "?", 2);
+ if (values.length == 2) {
+ this.url = values[0];
+ final Map<String, String> params = new HashMap<>();
+ for (String value : values[1].split("&")) {
+ String[] pair = StringUtils.split(value, "=", 2);
+ if (pair.length == 2) {
+ params.put(pair[0], pair[1]);
+ } else {
+ params.put(value, "");
+ }
+ }
+ this.analyzer = params.get("analyzer");
+ } else {
+ this.url = url;
+ this.analyzer = null;
+ }
+ this.minLength = minLength;
+ }
+
+ @Override
+ public void map(Chunk[] cs, NewChunk nc) {
+ final Gson gson = new GsonBuilder().create();
+ final BufferedString tmpStr = new BufferedString();
+ for (int row = 0; row < cs[0]._len; row++) {
+ for (Chunk chk : cs) {
+ if (chk.isNA(row))
+ continue; // input NAs are skipped
+ final String str = chk.atStr(tmpStr, row).toString();
+ final String[] ss = tokenize(gson, str);
+ for (String s : ss) {
+ if (s.length() >= minLength) {
+ nc.addStr(s);
+ }
+ }
+ }
+ nc.addNA(); // sequences of tokens are delimited by NAs
+ }
+ }
+
+ private String[] tokenize(final Gson gson, final String str) {
+ final Map<String, String> params = new HashMap<>();
+ params.put("text", str);
+ if (analyzer != null) {
+ params.put("analyzer", analyzer);
+ }
+ final String json = gson.toJson(params);
+ try (CurlResponse response = Curl.post(url).header("Content-Type", "application/json").body(json).execute()) {
+ final Tokens tokens = response.getContent(res -> {
+ try (BufferedReader reader = new BufferedReader(new InputStreamReader(res.getContentAsStream(), StandardCharsets.UTF_8))) {
+ return gson.fromJson(reader, Tokens.class);
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ });
+ return tokens.tokens.stream().map(t -> t.token).toArray(String[]::new);
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ }
+
+ /**
+ * Tokenizes a given Frame
+ * @param input Input Frame is expected to only contain String columns. Each row of the Frame represents a logical
+ * sentence. The sentence can span one or more cells of the row.
+ * @return Frame made of a single String column where original sentences are split into tokens and delimited by NAs.
+ */
+ public Frame transform(Frame input) {
+ return doAll(Vec.T_STR, input).outputFrame();
+ }
+
+ public static class Builder {
+ private String url;
+ private int minLength;
+
+ public Builder setUrl(String url) {
+ this.url = url;
+ return this;
+ }
+
+ public Builder setMinLength(int minLength) {
+ this.minLength = minLength;
+ return this;
+ }
+
+ public ElasticsearchTokenizer create() {
+ return new ElasticsearchTokenizer(url, minLength);
+ }
+ }
+
+ static class Tokens {
+ public List<Token> tokens = new ArrayList<>();
+ }
+
+ static class Token {
+ public String token;
+ public int start_offset;
+ public int end_offset;
+ public String type;
+ public int position;
+ }
+}
diff --git a/h2o-core/src/main/java/water/rapids/ast/prims/string/AstTokenize.java b/h2o-core/src/main/java/water/rapids/ast/prims/string/AstTokenize.java
index 61d13a2380..e4c96d7a56 100644
--- a/h2o-core/src/main/java/water/rapids/ast/prims/string/AstTokenize.java
+++ b/h2o-core/src/main/java/water/rapids/ast/prims/string/AstTokenize.java
@@ -1,5 +1,8 @@
package water.rapids.ast.prims.string;

+import org.apache.commons.lang.StringUtils;
+
+import hex.ElasticsearchTokenizer;
import hex.RegexTokenizer;
import water.MRTask;
import water.fvec.*;
@@ -36,6 +39,15 @@ public class AstTokenize extends AstPrimitive {
throw new IllegalArgumentException("tokenize() requires all input columns to be of a String type. "
+ "Received " + fr.anyVec().get_type_str() + ". Please convert column to a string column first.");
+ if (regex.startsWith("tokenize:")) {
+ final String[] values = StringUtils.split(regex, ":", 3);
+ if (values.length == 3) {
+ if("elasticsearch".equals(values[1])) {
+ Frame tokenized = new ElasticsearchTokenizer(values[2]).transform(fr);
+ return new ValFrame(tokenized);
+ }
+ }
+ }
Frame tokenized = new RegexTokenizer(regex).transform(fr);
return new ValFrame(tokenized);
}
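--
Note: the Tokens/Token classes above mirror the response shape of the
Elasticsearch _analyze API, one tokenize() call being issued per cell. A
request/response sketch (host and analyzer are illustrative assumptions):

    POST http://localhost:9200/_analyze
    {"analyzer": "standard", "text": "Hello World"}

    {"tokens": [
      {"token": "hello", "start_offset": 0, "end_offset": 5, "type": "<ALPHANUM>", "position": 0},
      {"token": "world", "start_offset": 6, "end_offset": 11, "type": "<ALPHANUM>", "position": 1}
    ]}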