rapidsai · shrshi · Jan 9, 2025 · Jan 9, 2025 · Jan 10, 2025 · Jan 10, 2025
@@ -36,6 +36,7 @@
 #include <rmm/exec_policy.hpp>
 
 #include <thrust/distance.h>
+#include <thrust/execution_policy.h>
 #include <thrust/iterator/constant_iterator.h>
 #include <thrust/scatter.h>
 
@@ -372,7 +373,9 @@ table_with_metadata read_json_impl(host_span<std::unique_ptr<datasource>> source
   std::size_t const size_per_subchunk      = estimate_size_per_subchunk(chunk_size);
   std::size_t const batch_size_upper_bound = get_batch_size_upper_bound();
   std::size_t const batch_size =
-    batch_size_upper_bound - (max_subchunks_prealloced * size_per_subchunk);
+    batch_size_upper_bound < (max_subchunks_prealloced * size_per_subchunk)
+      ? batch_size_upper_bound
+      : batch_size_upper_bound - (max_subchunks_prealloced * size_per_subchunk);
 
   /*
    * Identify the position (zero-indexed) of starting source file from which to begin
@@ -421,10 +424,72 @@ table_with_metadata read_json_impl(host_span<std::unique_ptr<datasource>> source
 
   std::vector<cudf::io::table_with_metadata> partial_tables;
   json_reader_options batched_reader_opts{reader_opts};
+
+  // Recursive lambda that builds a schema_element from a table's columns and column names.
+  // We assume that the table from the first batch contains all the columns of the concatenated
+  // table, and that the partial tables from all following batches contain the same set of columns.
+  std::function<schema_element(cudf::host_span<column_view const> cols,
+                               cudf::host_span<column_name_info const> names,
+                               schema_element & schema)>
+    construct_schema;
+  schema_element schema{data_type{cudf::type_id::STRUCT}};
+  construct_schema = [&construct_schema](cudf::host_span<column_view const> children,
+                                         cudf::host_span<column_name_info const> children_props,
+                                         schema_element& schema) -> schema_element {
+    CUDF_EXPECTS(
+      children.size() == children_props.size(),
+      "Mismatch in the number of children columns and children column properties received");
+
+    if (schema.type == data_type{cudf::type_id::LIST}) {
+      schema.column_order = {"element"};
+      CUDF_EXPECTS(children.size() == 2, "List should have two children");
+      auto element_idx = children_props[0].name == "element" ? 0 : 1;
+      schema_element child_schema{children[element_idx].type()};
+      std::vector<column_view> grandchildren_cols;
+      std::transform(children[element_idx].child_begin(),
+                     children[element_idx].child_end(),
+                     std::back_inserter(grandchildren_cols),
+                     [](auto& gc) { return gc; });
+      schema.child_types["element"] =
+        construct_schema(grandchildren_cols, children_props[element_idx].children, child_schema);
+    } else {
+      std::vector<std::string> col_order;
+      std::transform(children_props.begin(),
+                     children_props.end(),
+                     std::back_inserter(col_order),
+                     [](auto& c_prop) { return c_prop.name; });
+      schema.column_order = std::move(col_order);
+      for (auto i = 0ul; i < children.size(); i++) {
+        schema_element child_schema{children[i].type()};
+        std::vector<column_view> grandchildren_cols;
+        std::transform(children[i].child_begin(),
+                       children[i].child_end(),
+                       std::back_inserter(grandchildren_cols),
+                       [](auto& gc) { return gc; });
+        schema.child_types[children_props[i].name] =
+          construct_schema(grandchildren_cols, children_props[i].children, child_schema);
+      }
+    }
+
+    return schema;
+  };
+  batched_reader_opts.set_byte_range_offset(batch_offsets[0]);
+  batched_reader_opts.set_byte_range_size(batch_offsets[1] - batch_offsets[0]);
+  partial_tables.emplace_back(
+    read_batch(sources, batched_reader_opts, stream, cudf::get_current_device_resource_ref()));
+
+  auto& tbl = partial_tables.back().tbl;
+  std::vector<column_view> children;
+  for (size_type j = 0; j < tbl->num_columns(); j++)
+    children.emplace_back(tbl->get_column(j));
+  batched_reader_opts.set_dtypes(
+    construct_schema(children, partial_tables.back().metadata.schema_info, schema));
+  batched_reader_opts.enable_prune_columns(true);
+
   // Dispatch individual batches to read_batch and push the resulting table into
   // partial_tables array. Note that the reader options need to be updated for each
   // batch to adjust byte range offset and byte range size.
-  for (std::size_t i = 0; i < batch_offsets.size() - 1; i++) {
+  for (std::size_t i = 1; i < batch_offsets.size() - 1; i++) {
     batched_reader_opts.set_byte_range_offset(batch_offsets[i]);
     batched_reader_opts.set_byte_range_size(batch_offsets[i + 1] - batch_offsets[i]);
     partial_tables.emplace_back(

@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2024, NVIDIA CORPORATION.
+ * Copyright (c) 2024-2025, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -27,7 +27,15 @@
 #include <cudf/utilities/span.hpp>
 
 struct JsonLargeReaderTest : public cudf::test::StringsLargeTest,
-                             public testing::WithParamInterface<cudf::io::compression_type> {};
+                             public testing::WithParamInterface<cudf::io::compression_type> {
+ public:
+  void set_batch_size(size_t batch_size_upper_bound)
+  {
+    setenv("LIBCUDF_JSON_BATCH_SIZE", std::to_string(batch_size_upper_bound).c_str(), 1);
+  }
+
+  ~JsonLargeReaderTest() { unsetenv("LIBCUDF_JSON_BATCH_SIZE"); }
+};
 
 // Parametrize qualifying JSON tests for multiple compression types
 INSTANTIATE_TEST_SUITE_P(JsonLargeReaderTest,
@@ -47,7 +55,7 @@ TEST_P(JsonLargeReaderTest, MultiBatch)
 
   std::size_t const batch_size_upper_bound = std::numeric_limits<int32_t>::max() / 16;
   // set smaller batch_size to reduce file size and execution time
-  setenv("LIBCUDF_JSON_BATCH_SIZE", std::to_string(batch_size_upper_bound).c_str(), 1);
+  this->set_batch_size(batch_size_upper_bound);
 
   constexpr std::size_t expected_file_size = 1.5 * static_cast<double>(batch_size_upper_bound);
   std::size_t const log_repetitions =
@@ -127,7 +135,74 @@ TEST_P(JsonLargeReaderTest, MultiBatch)
     // cannot use EQUAL due to concatenate removing null mask
     CUDF_TEST_EXPECT_TABLES_EQUIVALENT(current_reader_table.tbl->view(), result->view());
   }
+}
+
+TEST_P(JsonLargeReaderTest, MultiBatchWithNulls)
+{
+  cudf::io::compression_type const comptype = GetParam();
 
-  // go back to normal batch_size
-  unsetenv("LIBCUDF_JSON_BATCH_SIZE");
+  // The goal of this test is to ensure that the column schema from the first
+  // batch is enforced on all following batches in the JSON reader. The column
+  // ordering from the first batch is applied to batches 2 and 3.
+  std::string json_string_b1 = R"(
+    { "a": { "y" : 6}, "b" : [1, 2, 3], "c": 11 }
+    { "a": { "y" : 6}, "b" : [4, 5   ], "c": 12 }
+    { "a": { "y" : 6}, "b" : [6      ], "c": 13 }
+    { "a": { "y" : 6}, "b" : [7      ], "c": 14 })";
+  std::string json_string_b2 = R"(
+    { "a": { "y" : 6}, "c": 11 }
+    { "a": { "y" : 6}, "b" : [4, 5   ], "c": 12 }
+    { "a": { "y" : 6}, "b" : [6      ], "c": 13 }
+    { "a": { "y" : 6}, "b" : [7      ], "c": 14 })";
+  std::string json_string_b3 = R"(
+    { "b" : [1, 2, 3], "a": { "y" : 6}}
+    { "a": { "y" : 6}, "b" : [4, 5   ], "c": 12 }
+    { "a": { "y" : 6}, "b" : [6      ], "c": 13 }
+    { "a": { "y" : 6}, "b" : [7      ], "c": 14 })";
+
+  // Set the batch size to the size of the first json string, `json_string_b1`.
+  std::size_t const batch_size_upper_bound = json_string_b1.size();
+  // Use this small batch size so that the concatenated input is read in multiple batches
+  this->set_batch_size(batch_size_upper_bound);
+
+  auto json_string = json_string_b1 + json_string_b2 + json_string_b3;
+  std::vector<std::uint8_t> cdata;
+  if (comptype != cudf::io::compression_type::NONE) {
+    cdata = cudf::io::detail::compress(
+      comptype,
+      cudf::host_span<uint8_t const>(reinterpret_cast<uint8_t const*>(json_string.data()),
+                                     json_string.size()),
+      cudf::get_default_stream());
+  } else
+    cdata = std::vector<uint8_t>(
+      reinterpret_cast<uint8_t const*>(json_string.data()),
+      reinterpret_cast<uint8_t const*>(json_string.data()) + json_string.size());
+
+  constexpr int num_sources = 2;
+  std::vector<cudf::host_span<std::byte>> hostbufs(
+    num_sources,
+    cudf::host_span<std::byte>(reinterpret_cast<std::byte*>(json_string.data()),
+                               json_string.size()));
+  std::vector<cudf::host_span<std::byte>> chostbufs(
+    num_sources,
+    cudf::host_span<std::byte>(reinterpret_cast<std::byte*>(cdata.data()), cdata.size()));
+
+  // Initialize parsing options (reading json lines)
+  cudf::io::json_reader_options json_lines_options =
+    cudf::io::json_reader_options::builder(
+      cudf::io::source_info{
+        cudf::host_span<cudf::host_span<std::byte>>(hostbufs.data(), hostbufs.size())})
+      .lines(true)
+      .compression(cudf::io::compression_type::NONE)
+      .recovery_mode(cudf::io::json_recovery_mode_t::FAIL);
+  cudf::io::json_reader_options cjson_lines_options =
+    cudf::io::json_reader_options::builder(
+      cudf::io::source_info{
+        cudf::host_span<cudf::host_span<std::byte>>(chostbufs.data(), chostbufs.size())})
+      .lines(true)
+      .compression(comptype)
+      .recovery_mode(cudf::io::json_recovery_mode_t::FAIL);
+
+  // Reading the (optionally compressed) sources across multiple batches must not throw
+  CUDF_EXPECT_NO_THROW(cudf::io::read_json(cjson_lines_options));
 }