
Sigma.js demo using node-rapids #392

Merged 79 commits on Jun 24, 2022. Showing changes from 17 commits.

Commits
a2508d0
index on fea-json-to-dataframe: 4daef83e npm pack then install llnode…
thomcom Apr 26, 2022
c604d85
WIP on fea-json-to-dataframe: 4daef83e npm pack then install llnode d…
thomcom Apr 26, 2022
3aeead0
Merge branch 'main' of github.com:rapidsai/node into fea-json-to-data…
trxcllnt May 2, 2022
9a5ce7e
Add split component of multibyte_split for better parsing.
thomcom May 2, 2022
f0e5c1a
Merge branch 'fea-json-to-dataframe' of github.com:thomcom/node into …
trxcllnt May 3, 2022
0fdeb9c
Added json_aos_to_dataframe.
thomcom May 4, 2022
e9b3173
Add the full test for loading the graphology dataset.
thomcom May 4, 2022
7cecc71
Passing test, needs a good fixture now though.
thomcom May 4, 2022
16163aa
Add less to dockerfile for git diff. Tweak the APIs slightly.
thomcom May 5, 2022
f2494ac
Commit various change requests.
thomcom May 5, 2022
2c98dec
Resolve Dockerfile conflict wrt less
thomcom May 5, 2022
0e5175c
Cleaning it up, and docs.
thomcom May 5, 2022
1bd7fa8
Move column tests to string tests.
thomcom May 5, 2022
628f221
Commit with sigma-server directory and rapids-api-server directories …
thomcom May 12, 2022
003c58d
Merge branch 'fea-json-to-dataframe' into fea-sigma-and-server
thomcom May 12, 2022
f6ccc10
Get it working with TypeScript.
thomcom May 12, 2022
a32a77e
Write fastify server that can read graphology datasets.
thomcom May 13, 2022
8c56868
First pass with fully working fastify server.
thomcom May 17, 2022
233606d
Trying to add fastify-arrow now
thomcom May 17, 2022
45d3ae1
Final batch of review changes.
thomcom May 17, 2022
ee1f012
Merge branch 'fea-json-to-dataframe' of github.com:thomcom/node into …
trxcllnt May 18, 2022
3133d46
Apply suggestions from code review
trxcllnt May 18, 2022
682915f
Merge branch 'fea-json-to-dataframe' of github.com:thomcom/node into …
trxcllnt May 18, 2022
e013dd0
move readText from StringSeries to Series
trxcllnt May 18, 2022
f681da8
update npmrc option
trxcllnt May 18, 2022
777266c
remove unused imports
trxcllnt May 18, 2022
1558df5
compare split values
trxcllnt May 18, 2022
6e79e37
Merge branch 'fea-json-to-dataframe' into fea-sigma-and-server
thomcom May 18, 2022
921b21a
Get Arrow Table streaming working from server.
thomcom May 19, 2022
ca6db64
Add sigma-server to lerna.
thomcom May 19, 2022
9c17d37
Merge remote-tracking branch 'upstream/main' into fea-sigma-and-server
thomcom May 20, 2022
f763d31
rapids-api-server working with sigma.js example demo.
thomcom May 23, 2022
ea7100d
Now feeding tables for demo data for sigma.js
thomcom May 25, 2022
5883cf2
Tile data for rendering.
thomcom May 27, 2022
ffbb652
Now feed edges to Sigma.
thomcom May 27, 2022
545be5f
Edges rendering.
thomcom May 27, 2022
cd8e95e
Remove yarn.lock
thomcom May 31, 2022
6c1b470
Clean up a typo and improve package.json for rapids-api-server.
thomcom Jun 3, 2022
ab49025
Remove tsconfig because there's no TypeScript in here?
thomcom Jun 3, 2022
fe42b31
Update modules/demo/rapids-api-server/routes/graphology/index.js
thomcom Jun 3, 2022
bfe62a0
Update modules/demo/rapids-api-server/routes/graphology/index.js
thomcom Jun 3, 2022
e1ea36e
Update modules/demo/rapids-api-server/routes/graphology/index.js
thomcom Jun 3, 2022
55004f5
Merge remote-tracking branch 'upstream/main' into fea-sigma-and-server
thomcom Jun 3, 2022
80af506
Update modules/demo/rapids-api-server/routes/graphology/index.js
thomcom Jun 3, 2022
0ff89cc
Update fastify-arrow to 1.0.0
thomcom Jun 3, 2022
fb81696
Remove a few unneeded files and fix root package.json
thomcom Jun 3, 2022
9ebc1da
Don't want a sigma.js submodule atm.
thomcom Jun 6, 2022
b5e98d1
Cleaning up to support Arrow.
thomcom Jun 6, 2022
b7caea0
Force typescript to 4.6.4 for now, due to an autoformatting problem t…
thomcom Jun 7, 2022
aaa4632
Fix routing for apache arrow in two demo modules and fixes for rapids…
thomcom Jun 8, 2022
352b13c
Fixing up API server by adding tests!
thomcom Jun 13, 2022
4b5bbbd
Wrote most of the tests. Stuck on one that is appearing a hang, but I…
thomcom Jun 15, 2022
6feff02
Endless runaround from node tap
thomcom Jun 15, 2022
ba2e459
Endless runaround from node tap
thomcom Jun 15, 2022
3dd5c68
Tests almost working. Add some semicolons.
thomcom Jun 16, 2022
aa6342d
They all work. Hard won.
thomcom Jun 16, 2022
535af17
Create and pass another test.
thomcom Jun 16, 2022
22d0fd7
Add proper file loading for large_graph_demo
thomcom Jun 17, 2022
0076285
Adding arrow based GET tests.
thomcom Jun 17, 2022
1bf63aa
Pass all tests.
thomcom Jun 17, 2022
6e29681
Pass all tests with proper data expectations.
thomcom Jun 17, 2022
83fc63b
Rename to api-server and create a detailed README.md
thomcom Jun 17, 2022
8710adc
Update naming and improve package.json, make index.js executable.
thomcom Jun 21, 2022
6514049
One more package change maybe.
thomcom Jun 21, 2022
7f6f874
Fix typescript issue lingering.
thomcom Jun 21, 2022
403cb6a
Missing end tag for last block in README.md
thomcom Jun 21, 2022
9a605c0
Make root and graphology schemas match.
thomcom Jun 22, 2022
7c5aaab
Fix gpu_cache tests and add copyrights.
thomcom Jun 22, 2022
d2189eb
Refactor math a little, remove unneeded typescript override.
thomcom Jun 22, 2022
b1df583
Type hints.
thomcom Jun 22, 2022
7ca254b
Merge
thomcom Jun 22, 2022
50a63f2
Crucial small changes to fix syncronization with sigma.js/examples/ex…
thomcom Jun 23, 2022
1ba06f9
Get rid of various uses of ._col and get better error checking in tests.
thomcom Jun 24, 2022
5496fee
Found one more ._col
thomcom Jun 24, 2022
200255e
Autocomplete gives me a weird import.
thomcom Jun 24, 2022
3182da0
Update modules/demo/api-server/util/gpu_cache.js
thomcom Jun 24, 2022
8bdc645
Update modules/demo/api-server/util/gpu_cache.js
thomcom Jun 24, 2022
4b42296
Update modules/demo/api-server/routes/graphology/index.js
thomcom Jun 24, 2022
a259809
Update modules/demo/api-server/util/gpu_cache.js
thomcom Jun 24, 2022
2 changes: 1 addition & 1 deletion dev/dockerfiles/devel/main.Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -322,7 +322,7 @@ RUN --mount=type=cache,target=/var/lib/apt \
\
&& apt update \
&& apt install --no-install-recommends -y \
jq entr ssh vim nano sudo bash-completion \
jq entr ssh vim nano sudo less bash-completion \
# X11 dependencies
libxi-dev libxrandr-dev libxinerama-dev libxcursor-dev \
# node-canvas dependencies
Expand Down
3 changes: 3 additions & 0 deletions modules/cudf/src/column.cpp
Expand Up @@ -148,6 +148,9 @@ Napi::Function Column::Init(Napi::Env const& env, Napi::Object exports) {
InstanceMethod<&Column::matches_re>("matchesRe"),
// column/strings/json.cpp
InstanceMethod<&Column::get_json_object>("getJSONObject"),
// io/text/multibyte_split.cpp
StaticMethod<&Column::read_text>("read_text"),
InstanceMethod<&Column::split>("split"),
// column/strings/padding.cpp
InstanceMethod<&Column::pad>("pad"),
InstanceMethod<&Column::zfill>("zfill"),
Expand Down
34 changes: 34 additions & 0 deletions modules/cudf/src/column.ts
Expand Up @@ -123,6 +123,20 @@ export interface ColumnConstructor {
init: Scalar<U>,
step: Scalar<U>,
memoryResource?: MemoryResource): Column<U>;

/**
* Fills a column with the UTF-8 string located at filepath. If a delimiter is included then
* the input string will be split into a sequence of strings. The delimiter will remain
* at the end of each string in the column, except for the last. If no delimiter is included,
* the input string will be read into a single string at element 0 of the Column.
*
* @param filepath The location of the input file.
* @param delimiter Optional delimiter.
* @returns column containing one or more strings.
*
* @note The maximum size of a string read with this method is 2^30
*/
read_text(filepath: string, delimiter: string): Column<Utf8String>;
}

/**
Expand Down Expand Up @@ -1459,6 +1473,26 @@ export interface Column<T extends DataType = any> {
*/
replaceSlice(repl: string, start: number, stop: number, memoryResource?: MemoryResource):
Column<Utf8String>;

/**
* Splits a string column by delimiter. The delimiter string will remain at the end of each
* string in the split column. This method completely redraws the string boundaries of the
* column: existing boundaries are removed and new ones are introduced at each occurrence of
* the delimiter. If called with an empty delimiter, the column's n string values are
* combined into a single value.
*
* @example
* ```typescript
* let a = Series.new(['abcdefg', 'bcdefgh']);
* a.split('d');
* [ 'abcd', 'efgbcd', 'efgh' ]
* a.split('');
* [ 'abcdefgbcdefgh' ]
* ```
* @param delimiter split along the delimiter.
* @returns New strings column
*/
split(delimiter: string): Column<Utf8String>;
}

// eslint-disable-next-line @typescript-eslint/no-redeclare
Expand Down
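The delimiter-retaining behavior documented for `split` above differs from JavaScript's built-in `String.prototype.split`, which discards the delimiter. A minimal plain-TypeScript sketch of the documented semantics — an illustration only, not the GPU implementation, and `splitKeepingDelimiter` is a hypothetical name:

```typescript
// Models Column.split's documented semantics: all rows are first combined
// into one string, then re-split so that the delimiter remains at the end
// of every piece except the last. An empty delimiter yields the single
// combined string.
function splitKeepingDelimiter(rows: string[], delimiter: string): string[] {
  const joined = rows.join('');
  if (delimiter === '') { return [joined]; }
  const out: string[] = [];
  let start = 0;
  let idx   = joined.indexOf(delimiter, start);
  while (idx !== -1) {
    out.push(joined.slice(start, idx + delimiter.length));
    start = idx + delimiter.length;
    idx   = joined.indexOf(delimiter, start);
  }
  out.push(joined.slice(start));
  return out;
}

// Matches the doc example above:
// splitKeepingDelimiter(['abcdefg', 'bcdefgh'], 'd') → ['abcd', 'efgbcd', 'efgh']
// splitKeepingDelimiter(['abcdefg', 'bcdefgh'], '')  → ['abcdefgbcdefgh']
```

The model only covers the cases the docs spell out; it makes no claim about cudf's behavior for inputs that end in the delimiter.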
77 changes: 77 additions & 0 deletions modules/cudf/src/column/strings/multibyte_split.cpp
@@ -0,0 +1,77 @@
// Copyright (c) 2021, NVIDIA CORPORATION.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <cudf/io/text/data_chunk_source_factories.hpp>
#include <cudf/io/text/multibyte_split.hpp>
#include <node_cudf/column.hpp>
#include <node_cudf/table.hpp>
#include <node_cudf/utilities/metadata.hpp>

namespace nv {

namespace {

Column::wrapper_t split_string_column(Napi::CallbackInfo const& info,
cudf::mutable_column_view const& col,
std::string const& delimiter) {
auto env = info.Env();
/* TODO: This only splits a string column. How to generalize */
// Check type
auto span = cudf::device_span<char const>(col.child(1).data<char const>(), col.child(1).size());

auto datasource = cudf::io::text::device_span_data_chunk_source(span);
return Column::New(env, cudf::io::text::multibyte_split(datasource, delimiter));
}

Column::wrapper_t read_text_files(Napi::CallbackInfo const& info,
std::string const& filename,
std::string const& delimiter) {
auto datasource = cudf::io::text::make_source_from_file(filename);
auto text_data = cudf::io::text::multibyte_split(*datasource, delimiter);
auto env = info.Env();
return Column::New(env, std::move(text_data));
}

} // namespace

Napi::Value Column::split(Napi::CallbackInfo const& info) {
CallbackArgs args{info};

if (args.Length() != 1) { NAPI_THROW(Napi::Error::New(info.Env(), "split expects a delimiter")); }

auto delimiter = args[0];
auto col = this->mutable_view();
try {
return split_string_column(info, col, delimiter);
} catch (cudf::logic_error const& err) { NAPI_THROW(Napi::Error::New(info.Env(), err.what())); }
}

Napi::Value Column::read_text(Napi::CallbackInfo const& info) {
CallbackArgs args{info};

if (args.Length() != 2) {
NAPI_THROW(Napi::Error::New(info.Env(), "read_text expects a filename and a delimiter"));
}

auto source = args[0];
auto delimiter = args[1];

try {
return read_text_files(info, source, delimiter);

} catch (cudf::logic_error const& err) { NAPI_THROW(Napi::Error::New(info.Env(), err.what())); }
}

} // namespace nv
4 changes: 4 additions & 0 deletions modules/cudf/src/node_cudf/column.hpp
Expand Up @@ -1033,6 +1033,10 @@ struct Column : public EnvLocalObjectWrap<Column> {
Napi::Value string_is_ipv4(Napi::CallbackInfo const& info);
Napi::Value ipv4_from_integers(Napi::CallbackInfo const& info);
Napi::Value ipv4_to_integers(Napi::CallbackInfo const& info);

// io/text.hpp
static Napi::Value read_text(Napi::CallbackInfo const& info);
Napi::Value split(Napi::CallbackInfo const& info);
};

} // namespace nv
10 changes: 5 additions & 5 deletions modules/cudf/src/series.ts
Expand Up @@ -939,11 +939,11 @@ export class AbstractSeries<T extends DataType = any> {
/**
* @summary Return sub-selection from a Series using the specified integral indices.
*
* @description Gathers the rows of the source columns according to `selection`, such that row "i"
* in the resulting Series's columns will contain row `selection[i]` from the source columns. The
* number of rows in the result series will be equal to the number of elements in selection. A
* negative value i in the selection is interpreted as i+n, where `n` is the number of rows in
* the source series.
* @description Gathers the rows of the source columns according to `selection`, such that row
* "i" in the resulting Series's columns will contain row `selection[i]` from the source
* columns. The number of rows in the result series will be equal to the number of elements in
* selection. A negative value i in the selection is interpreted as i+n, where `n` is the number
* of rows in the source series.
*
* For dictionary columns, the keys column component is copied and not trimmed if the gather
* results in abandoned key elements.
Expand Down
41 changes: 41 additions & 0 deletions modules/cudf/src/series/string.ts
Expand Up @@ -445,6 +445,47 @@ export class StringSeries extends Series<Utf8String> {
return this.__construct(this._col.zfill(width, memoryResource));
}

/**
* Resplits a StringSeries along the delimiter.
*
* @note If delimiter is omitted, the default is ''.
*
* @param delimiter Optional delimiter.
*
* @returns Series with new splits determined by the delimiter.
*
* @example
* ```typescript
* import {Series} from '@rapidsai/cudf';
*
* const a = Series.new(['abcdefg', 'bcdefgh']);
* a.split('d'); // ['abcd', 'efgbcd', 'efgh']
* ```
*/
split(delimiter: string): Series<Utf8String> {
return this.__construct(this._col.split(delimiter));
}

/**
* Constructs a Series with an input filename as source.
*
* @note If delimiter is omitted, the default is ''.
*
* @param filepath Path of the input file.
* @param delimiter Optional delimiter.
*
* @returns StringSeries from the file, split by delimiter
*
* @example
* ```typescript
* import {Series} from '@rapidsai/cudf';
*
* const infile = Series.read_text('./inputAsciiFile.txt')
* ```
*/
public static read_text(filepath: string, delimiter?: string): Series<Utf8String> {
return StringSeries.new(Column.read_text(filepath, delimiter ?? ''));
}

/**
* Applies a JSONPath(string) where each row in the series is a valid json string. Returns New
* StringSeries containing the retrieved json object strings
Expand Down
1 change: 0 additions & 1 deletion modules/cudf/test/cudf-column-tests.ts
Expand Up @@ -46,7 +46,6 @@ test('Column initialization with null_mask', () => {
nullMask: new Uint8Buffer(64).fill(0),
});

expect(col.type).toBeInstanceOf(Bool8);
expect(col.length).toBe(length);
expect(col.nullCount).toBe(100);
expect(col.hasNulls).toBe(true);
Expand Down
140 changes: 140 additions & 0 deletions modules/cudf/test/cudf-series-test.ts
Expand Up @@ -29,6 +29,8 @@ import {
import {
Bool8,
Column,
DataFrame,
DataType,
Float32,
Float32Series,
Float64,
Expand All @@ -42,6 +44,8 @@ import {
Int8,
Int8Series,
Series,
SeriesMap,
StringSeries,
TimestampDay,
TimestampMicrosecond,
TimestampMillisecond,
Expand All @@ -60,6 +64,8 @@ import {
import {CudaMemoryResource, DeviceBuffer} from '@rapidsai/rmm';
import {Uint8Vector, Utf8Vector} from 'apache-arrow';
import {BoolVector} from 'apache-arrow';
import {promises} from 'fs';
import * as Path from 'path';

const mr = new CudaMemoryResource();

Expand Down Expand Up @@ -862,3 +868,137 @@ ${false} | ${false} | ${false} | ${[4, null, 1, 2, null, 3, 4]} | ${[1, 2, 3
const result = s.dropDuplicates(keep, nullsEqual, nullsFirst);
expect([...result]).toEqual(expected);
});

/* TODO: How do I apply a list of dtypes?
*/
function json_aos_to_dataframe(
str: StringSeries, columns: ReadonlyArray<string>, _: ReadonlyArray<DataType>): DataFrame {
const arr = {} as SeriesMap;
columns.forEach((col, ix) => {
const no_open_list = str.split('[\n').gather([1], false);
const tokenized = no_open_list.split('},');
const parse_result = tokenized._col.getJSONObject('.' + columns[ix]);
arr[col] = Series.new(parse_result);
});
const result = new DataFrame(arr);
return result;
}
/* TODO: How do I apply a list of dtypes?
*/
function json_aoa_to_dataframe(str: StringSeries, dtypes: ReadonlyArray<DataType>): DataFrame {
const arr = {} as SeriesMap;
dtypes.forEach((_, ix) => {
const no_open_list = str.split('[\n').gather([1], false);
const tokenized = no_open_list.split('],');
const get_ix = `[${ix}]`;
const parse_result = tokenized._col.getJSONObject(get_ix);
arr[ix] = Series.new(parse_result);
});
const result = new DataFrame(arr);
return result;
}

describe('Graphology dataset parsing', () => {
test('extracts four objects from the base object', () => {
const dataset = StringSeries.read_text('dataset_small.json.txt', '');
let split = dataset.split('"tags":');
const ttags = split.gather([1], false);
let rest = split.gather([0], false);
split = rest.split('"clusters":');
const tclusters = split.gather([1], false);
rest = split.gather([0], false);
split = rest.split('"edges":');
const tedges = split.gather([1], false);
rest = split.gather([0], false);
split = rest.split('"nodes":');
const tnodes = split.gather([1], false);
const tags = json_aos_to_dataframe(ttags, ['key', 'image'], [new Utf8String, new Utf8String]);
const clusters = json_aos_to_dataframe(
tclusters, ['key', 'color', 'clusterLabel'], [new Int32, new Utf8String, new Utf8String]);
const nodes =
json_aos_to_dataframe(tnodes, ['key', 'label', 'tag', 'URL', 'cluster', 'x', 'y', 'score'], [
new Utf8String,
new Utf8String,
new Utf8String,
new Utf8String,
new Int32,
new Float64,
new Float64,
new Int32
]);
const edges = json_aoa_to_dataframe(tedges, [new Utf8String, new Utf8String]);
expect(nodes.names).toEqual(['key', 'label', 'tag', 'URL', 'cluster', 'x', 'y', 'score']);
expect(nodes.numRows).toEqual(5);
expect(edges.numRows).toEqual(11);
expect(clusters.names).toEqual(['key', 'color', 'clusterLabel']);
expect(clusters.numRows).toEqual(24);
expect(tags.names).toEqual(['key', 'image']);
expect(tags.numRows).toEqual(11);
});
});

describe('StringSeries.read_text', () => {
test('can read a json file', async () => {
const rows = [
{a: 0, b: 1.0, c: '2'},
{a: 1, b: 2.0, c: '3'},
{a: 2, b: 3.0, c: '4'},
];
const outputString = JSON.stringify(rows);
const path = Path.join(readTextTmpDir, 'simple.txt');
await promises.writeFile(path, outputString);
const text = StringSeries.read_text(path, '');
expect(text.getValue(0)).toEqual(outputString);
await new Promise<void>((resolve, reject) =>
rimraf(path, (err?: Error|null) => err ? reject(err) : resolve()));
});
test('can read a random file', async () => {
const outputString = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789!@#$%^&*()';
const path = Path.join(readTextTmpDir, 'simple.txt');
await promises.writeFile(path, outputString);
const text = StringSeries.read_text(path, '');
expect(text.getValue(0)).toEqual(outputString);
await new Promise<void>((resolve, reject) =>
rimraf(path, (err?: Error|null) => err ? reject(err) : resolve()));
});
test('can read an empty file', async () => {
const outputString = '';
const path = Path.join(readTextTmpDir, 'simple.txt');
await promises.writeFile(path, outputString);
const text = StringSeries.read_text(path, '');
expect(text.getValue(0)).toEqual(outputString);
await new Promise<void>((resolve, reject) =>
rimraf(path, (err?: Error|null) => err ? reject(err) : resolve()));
});
});

describe('StringSeries split', () => {
test('split a basic string', () => {
const input = StringSeries.new(['abcdefg']);
const example = StringSeries.new(['abcd', 'efg']);
const result = StringSeries.new(input._col.split('d'));
expect(result).toEqual(example);
});
test('split a string twice', () => {
const input = StringSeries.new(['abcdefgdcba']);
const example = StringSeries.new(['abcd', 'efgd', 'cba']);
const result = StringSeries.new(input._col.split('d'));
expect(result).toEqual(example);
});
});

let readTextTmpDir = '';

const rimraf = require('rimraf');

beforeAll(async () => { //
readTextTmpDir = await promises.mkdtemp(Path.join('/tmp', 'node_cudf'));
});

afterAll(() => {
return new Promise<void>((resolve, reject) => { //
rimraf(readTextTmpDir, (err?: Error|null) => err ? reject(err) : resolve());
});
});
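The repeated split/gather pattern in the graphology test above — split the raw text on a key such as `"edges":`, take `gather([1], false)` as that section and `gather([0], false)` as the remainder — can be sketched in plain TypeScript. This models the parsing strategy only, not the GPU code; `extractSection` and `extractAll` are hypothetical names:

```typescript
// Splitting on '"<key>":' yields [head, tail]; the tail is that section's
// JSON text and the head is the remaining document, which the next key is
// extracted from in turn.
function extractSection(text: string, key: string): {section: string; rest: string} {
  const idx = text.indexOf(key);
  if (idx === -1) { return {section: '', rest: text}; }
  return {section: text.slice(idx + key.length), rest: text.slice(0, idx)};
}

// Keys are peeled off back-to-front, mirroring the test: tags, clusters,
// edges, then nodes.
function extractAll(text: string, keys: string[]): Record<string, string> {
  const out: Record<string, string> = {};
  let rest = text;
  for (const key of keys) {
    const hit = extractSection(rest, `"${key}":`);
    out[key]  = hit.section;
    rest      = hit.rest;
  }
  return out;
}
```

Because each split keeps only the head as the working remainder, a later key can never capture text belonging to a section that was already peeled off.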