Skip to content

Commit

Permalink
Merge pull request #4281 from vespa-engine/glebashnik/unbonded-string…
Browse files Browse the repository at this point in the history
…-labels-feed-test

Performance test for feeding tensors with millions of string labels
  • Loading branch information
bjorncs authored Nov 19, 2024
2 parents 54d831f + 0af96df commit 1463145
Show file tree
Hide file tree
Showing 4 changed files with 174 additions and 2 deletions.
6 changes: 4 additions & 2 deletions lib/performance_test.rb
Original file line number Diff line number Diff line change
Expand Up @@ -169,8 +169,10 @@ def fill_feeder_json(json)

def post_process_feed_output(output, client, custom_fillers)
if client == :vespa_feed_client
json = JSON.parse(output)
fillers = [fill_feeder_json(json)]
json = '[' + output.gsub('}{', '},{') + ']'
feed_outputs = JSON.parse(json)
last_feed_output = feed_outputs[-1]
fillers = [fill_feeder_json(last_feed_output)]
else
lines = output.split("\n")[-1]
res = lines.gsub(/\s+/, "").split(",")
Expand Down
101 changes: 101 additions & 0 deletions tests/performance/tensor_unbound_string_labels_feed/data_generator.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
// Copyright Vespa.ai. All rights reserved.

#include <algorithm>
#include <cstdlib>
#include <ctime>
#include <exception>
#include <iostream>
#include <numeric>
#include <random>
#include <string>
#include <vector>

// To compile on rhel6
// g++ -Wl,-rpath,$vespa_home/lib64/ -Wall -g -O3 -o data_generator data_generator.cpp

// Writes the "cells" object of a mapped tensor to 'os' and returns the stream.
// Produces 'tensor_size' cells; each cell is keyed by a generated string label
// and holds the same random number that is embedded in that label.
std::ostream &
generate_tensor(std::ostream &os, int doc_id, int tensor_size)
{
    os << "\"cells\":{";

    for (int label_idx = 0; label_idx < tensor_size; ++label_idx) {
        // Cells are comma separated, so every cell but the first gets a leading comma
        if (label_idx > 0) {
            os << ",";
        }

        // The random number is part of the label text as well as the cell value;
        // the stated intent is to avoid cache hits on the labels.
        const int cell_value = std::rand();
        os << "\"doc_" << doc_id << "_label_" << label_idx << "_value_" << cell_value << "\":" << cell_value;
    }

    os << "}";
    return os;
}

// Writes one JSON put operation for document 'doc_id' to 'os'.
// The document has a single field, "tensor", holding 'tensor_size' generated cells.
void
generate_put(std::ostream &os, int doc_id, int tensor_size)
{
    // Operation header: the document id and the opening of the "fields" object
    os << "{\"put\":\"id:test:test::" << doc_id << "\",\"fields\":{" << std::endl;

    // The "tensor" field wraps the generated "cells" object
    os << "\"tensor\":{";
    generate_tensor(os, doc_id, tensor_size);
    os << "}";

    // Close "fields" and then the put operation itself
    os << "}}";
}

// Returns the document ids 0 .. num_docs - 1.
// With 'shuffle' set the ids come back in a pseudo-random order, otherwise ascending.
std::vector<int>
generate_doc_ids(int num_docs, bool shuffle)
{
    // Start from the ascending sequence
    std::vector<int> doc_ids(num_docs);
    std::iota(doc_ids.begin(), doc_ids.end(), 0);

    if (!shuffle) {
        return doc_ids;
    }

    // The engine is seeded from std::time, so the permutation is generally
    // not reproducible from one run to the next.
    const auto seed = static_cast<unsigned>(std::time(nullptr));
    std::mt19937 engine(seed);
    std::shuffle(doc_ids.begin(), doc_ids.end(), engine);

    return doc_ids;
}

// Writes a JSON array of 'num_docs' put operations to 'os'.
// Documents are emitted in shuffled id order, each with 'tensor_size' tensor cells.
void
generate_puts(std::ostream &os, int num_docs, int tensor_size)
{
    const std::vector<int> shuffled_ids = generate_doc_ids(num_docs, true);

    os << "[" << std::endl;

    for (std::size_t pos = 0; pos < shuffled_ids.size(); ++pos) {
        // Array elements are separated by a comma and a line break
        if (pos > 0) {
            os << "," << std::endl;
        }
        generate_put(os, shuffled_ids[pos], tensor_size);
    }

    os << std::endl << "]" << std::endl;
}

// Prints a one-line synopsis of the command line to standard error,
// prefixed with the program name taken from argv[0].
void
usage(char *argv[])
{
    const char *program_name = argv[0];
    std::cerr << program_name << " <num-docs> <tensor-size>" << std::endl;
}

// Checks that exactly two arguments follow the program name.
// Prints the usage line and returns false when the count is wrong.
bool
verify_usage(int argc, char *argv[])
{
    const bool arg_count_ok = (argc == 3);
    if (!arg_count_ok) {
        usage(argv);
    }
    return arg_count_ok;
}

int
main(int argc, char *argv[])
{
if (!verify_usage(argc, argv)) {
return 1;
}

int num_docs = std::stoi(argv[1]);
int tensor_size = std::stoi(argv[2]);

generate_puts(std::cout, num_docs, tensor_size);

return 0;
}

Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
# coding: utf-8
# Copyright Vespa.ai. All rights reserved.
require 'performance_test'
require 'app_generator/search_app'
require 'environment'

# Performance test that feeds documents carrying a mapped tensor with generated
# string labels and records feed results, container memory usage and a profile.
class TensorUnboundStringLabelsTest < PerformanceTest
  def initialize(*args)
    super(*args)
  end

  def setup
    super
    set_owner('glebashnik')
  end

  # Builds the application under test: the schema stored next to this file and
  # a container with search and document API enabled.
  def create_app
    # Only the heap size is configured for the container JVM. A JDWP debug agent
    # must not be enabled here: it opens a remote debugger port on all
    # interfaces and loads a debug agent into the JVM being measured.
    SearchApp.new.sd(selfdir + 'test.sd').disable_flush_tuning.
      container(Container.new.search(Searching.new).
                  documentapi(ContainerDocumentApi.new).
                  jvmoptions('-Xms8g -Xmx8g'))
  end

  def test_feed
    set_description('Test feed performance for unbounded string labels in mapped tensors')
    deploy_app(create_app)
    start
    compile_data_generator
    # 30 000 documents with 1000 tensor cells each
    feed_and_profile(30_000, 1000)
  end

  def feeder_numthreads
    5
  end

  # Compiles data_generator.cpp on the admin server and remembers the path of
  # the resulting binary in @data_generator.
  def compile_data_generator
    tmp_bin_dir = vespa.adminserver.create_tmp_bin_dir
    @data_generator = "#{tmp_bin_dir}/data_generator"
    # TODO: make this work on centos7 as well
    vespa.adminserver.execute("g++ -Wl,-rpath,#{Environment.instance.vespa_home}/lib64/ -Wall -g -O3 -o #{@data_generator} #{selfdir}/data_generator.cpp")
  end

  # Runs the compiled generator for num_docs documents of tensor_size cells
  # through the stream feeder (vespa_feed_client) with profiling enabled, and
  # writes a profiler report tagged with both sizes.
  def feed_and_profile(num_docs, tensor_size)
    # Use qrserver '0' when present, otherwise the first container
    container = (vespa.qrserver['0'] or vespa.container.values.first)

    profiler_start
    run_stream_feeder(
      "#{@data_generator} #{num_docs} #{tensor_size}",
      [
        parameter_filler('legend', 'test_container_feed_performance'),
        metric_filler('memory.rss', container.memusage_rss(container.get_pid))
      ],
      { :client => :vespa_feed_client }
    )
    profiler_report("d#{num_docs}-t#{tensor_size}")
  end

  def teardown
    super
  end
end
8 changes: 8 additions & 0 deletions tests/performance/tensor_unbound_string_labels_feed/test.sd
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
# Copyright Vespa.ai. All rights reserved.
# Schema for the tensor string label feed performance test.
# It declares one document type with a single tensor field.
schema test {
document test {
# Tensor with one mapped dimension named "label"; the cell labels come from the feed
field tensor type tensor(label{}) {
# Stored as an attribute and included in the document summary
indexing: attribute | summary
}
}
}

0 comments on commit 1463145

Please sign in to comment.