From 58112718c583967ab0a63032126a400ce53bf136 Mon Sep 17 00:00:00 2001 From: Vigya Sharma Date: Fri, 6 Sep 2024 10:29:36 -0700 Subject: [PATCH 01/13] support for parentJoins in benchmarks --- src/main/knn/KnnGraphTester.java | 124 ++++++++++++++++---- src/main/knn/KnnIndexer.java | 127 +++++++++++++++++---- src/main/knn/KnnIndexerMain.java | 13 ++- src/main/knn/ParentJoinBenchmarkQuery.java | 109 ++++++++++++++++++ src/python/knnPerfTest.py | 23 +++- 5 files changed, 346 insertions(+), 50 deletions(-) create mode 100644 src/main/knn/ParentJoinBenchmarkQuery.java diff --git a/src/main/knn/KnnGraphTester.java b/src/main/knn/KnnGraphTester.java index e2e26ec5f..e46700706 100644 --- a/src/main/knn/KnnGraphTester.java +++ b/src/main/knn/KnnGraphTester.java @@ -65,11 +65,15 @@ import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.LeafReader; import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.QueryTimeout; import org.apache.lucene.index.StoredFields; +import org.apache.lucene.index.Term; import org.apache.lucene.index.VectorEncoding; import org.apache.lucene.index.VectorSimilarityFunction; import org.apache.lucene.search.ConstantScoreScorer; import org.apache.lucene.search.ConstantScoreWeight; +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.search.FilteredDocIdSetIterator; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.KnnByteVectorQuery; import org.apache.lucene.search.KnnFloatVectorQuery; @@ -79,12 +83,20 @@ import org.apache.lucene.search.ScoreMode; import org.apache.lucene.search.Scorer; import org.apache.lucene.search.ScorerSupplier; +import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TopDocs; import org.apache.lucene.search.TotalHits; import org.apache.lucene.search.Weight; +import org.apache.lucene.search.join.BitSetProducer; +import org.apache.lucene.search.join.CheckJoinIndex; +import org.apache.lucene.search.join.DiversifyingChildrenFloatKnnVectorQuery; +import org.apache.lucene.search.join.QueryBitSetProducer; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; +import org.apache.lucene.util.BitSet; import org.apache.lucene.util.BitSetIterator; +import org.apache.lucene.util.Bits; +import org.apache.lucene.util.DocIdSetBuilder; import org.apache.lucene.util.FixedBitSet; import org.apache.lucene.util.NamedThreadFactory; import org.apache.lucene.util.PrintStreamInfoStream; @@ -109,6 +121,9 @@ public class KnnGraphTester { public static final String KNN_FIELD = "knn"; public static final String ID_FIELD = "id"; private static final String INDEX_DIR = "knnIndices"; + public static final String DOCTYPE_FIELD = "docType"; + public static final String WIKI_ID_FIELD = "wikiID"; + public static final String WIKI_PARA_ID_FIELD = "wikiParaID"; private int numDocs; private int dim; @@ -134,11 +149,13 @@ public class KnnGraphTester { private float selectivity; private boolean prefilter; private boolean randomCommits; + private boolean parentJoin = false; + private Path parentJoinMetaFile; private KnnGraphTester() { // set defaults numDocs = 1000; - numIters = 1000; + numIters = 100; dim = 256; topK = 100; numMergeThread = 1; @@ -205,30 +222,35 @@ private void run(String... args) throws Exception { throw new IllegalArgumentException("-beamWidthIndex requires a following number"); } beamWidth = Integer.parseInt(args[++iarg]); + log("beamWidth = %d", beamWidth); break; case "-maxConn": if (iarg == args.length - 1) { throw new IllegalArgumentException("-maxConn requires a following number"); } maxConn = Integer.parseInt(args[++iarg]); + log("maxConn = %d", maxConn); break; case "-dim": if (iarg == args.length - 1) { throw new IllegalArgumentException("-dim requires a following number"); } dim = Integer.parseInt(args[++iarg]); + log("Vector Dimensions: %d", dim); break; case "-ndoc": if (iarg == args.length - 1) { throw new IllegalArgumentException("-ndoc requires a following number"); } numDocs = Integer.parseInt(args[++iarg]); + log("numDocs = %d", numDocs); break; case "-niter": if (iarg == args.length - 1) { throw new IllegalArgumentException("-niter requires a following number"); } numIters = Integer.parseInt(args[++iarg]); + log("numIters = %d", numIters); break; case "-reindex": reindex = true; @@ -294,6 +316,7 @@ private void run(String... args) throws Exception { default: throw new IllegalArgumentException("-metric can be 'angular', 'euclidean', 'cosine', or 'mip' only"); } + log("similarity = %s", similarityFunction); break; case "-forceMerge": forceMerge = true; @@ -329,6 +352,13 @@ private void run(String... args) throws Exception { throw new IllegalArgumentException("-numMergeThread should be >= 1"); } break; + case "-parentJoin": + if (iarg == args.length - 1) { + throw new IllegalArgumentException("-parentJoin requires a following Path for parentJoinMetaFile"); + } + parentJoinMetaFile = Paths.get(args[++iarg]); + parentJoin = true; + break; default: throw new IllegalArgumentException("unknown argument " + arg); // usage(); @@ -342,6 +372,11 @@ private void run(String... args) throws Exception { } if (indexPath == null) { indexPath = Paths.get(formatIndexPath(docVectorsPath)); // derive index path + log("Index Path = %s", indexPath); + } + if (parentJoin && !reindex && !isParentJoinIndex(indexPath)) { + throw new IllegalArgumentException("Provided index: [" + indexPath + "] does not have parent-child " + + "document relationships. Rerun with -reindex or without -parentJoin argument"); } if (reindex) { if (docVectorsPath == null) { @@ -356,7 +391,9 @@ private void run(String... args) throws Exception { similarityFunction, numDocs, 0, - quiet + quiet, + parentJoin, + parentJoinMetaFile ).createIndex(); System.out.println("reindex takes " + reindexTimeMsec + " ms"); } @@ -386,11 +423,23 @@ private void run(String... args) throws Exception { } private String formatIndexPath(Path docsPath) { + List suffix = new ArrayList<>(); + suffix.add(Integer.toString(maxConn)); + suffix.add(Integer.toString(beamWidth)); if (quantize) { - return INDEX_DIR + "/" + docsPath.getFileName() + "-" + maxConn + "-" + beamWidth + "-" - + quantizeBits + (quantizeCompress ? "-compressed" : "" ) + ".index"; + suffix.add(Integer.toString(quantizeBits)); + if (quantizeCompress == true) { + suffix.add("-compressed"); + } + } + if (parentJoin) { + suffix.add("parentJoin"); } - return INDEX_DIR + "/" + docsPath.getFileName() + "-" + maxConn + "-" + beamWidth + ".index"; + return INDEX_DIR + "/" + docsPath.getFileName() + "-" + String.join("-", suffix) + ".index"; + } + + private boolean isParentJoinIndex(Path indexPath) { + return indexPath.toString().contains("parentJoin"); } @SuppressForbidden(reason = "Prints stuff") @@ -525,9 +574,7 @@ private void testSearch(Path indexPath, Path queryPath, Path outputPath, int[][] if (targetReader instanceof VectorReaderByte b) { targetReaderByte = b; } - if (quiet == false) { - System.out.println("running " + numIters + " targets; topK=" + topK + ", fanout=" + fanout); - } + log("running " + numIters + " targets; topK=" + topK + ", fanout=" + fanout); long start; ThreadMXBean bean = ManagementFactory.getThreadMXBean(); long cpuTimeStartNs; @@ -538,6 +585,7 @@ private void testSearch(Path indexPath, Path queryPath, Path outputPath, int[][] Query bitSetQuery = prefilter ? new BitSetQuery(matchDocs) : null; for (int i = 0; i < numIters; i++) { // warm up + log("\t...warm up for query #%d", i); if (vectorEncoding.equals(VectorEncoding.BYTE)) { byte[] target = targetReaderByte.nextBytes(); if (prefilter) { @@ -548,9 +596,9 @@ private void testSearch(Path indexPath, Path queryPath, Path outputPath, int[][] } else { float[] target = targetReader.next(); if (prefilter) { - doKnnVectorQuery(searcher, KNN_FIELD, target, topK, fanout, bitSetQuery); + doKnnVectorQuery(searcher, KNN_FIELD, target, topK, fanout, bitSetQuery, parentJoin); } else { - doKnnVectorQuery(searcher, KNN_FIELD, target, (int) (topK / selectivity), fanout, null); + doKnnVectorQuery(searcher, KNN_FIELD, target, (int) (topK / selectivity), fanout, null, parentJoin); } } } @@ -558,6 +606,7 @@ private void testSearch(Path indexPath, Path queryPath, Path outputPath, int[][] start = System.nanoTime(); cpuTimeStartNs = bean.getCurrentThreadCpuTime(); for (int i = 0; i < numIters; i++) { + log("\t...running search for query #%d", i); if (vectorEncoding.equals(VectorEncoding.BYTE)) { byte[] target = targetReaderByte.nextBytes(); if (prefilter) { @@ -568,11 +617,11 @@ private void testSearch(Path indexPath, Path queryPath, Path outputPath, int[][] } else { float[] target = targetReader.next(); if (prefilter) { - results[i] = doKnnVectorQuery(searcher, KNN_FIELD, target, topK, fanout, bitSetQuery); + results[i] = doKnnVectorQuery(searcher, KNN_FIELD, target, topK, fanout, bitSetQuery, parentJoin); } else { results[i] = doKnnVectorQuery( - searcher, KNN_FIELD, target, (int) (topK / selectivity), fanout, null); + searcher, KNN_FIELD, target, (int) (topK / selectivity), fanout, null, parentJoin); } } if (prefilter == false && matchDocs != null) { @@ -666,8 +715,13 @@ private static TopDocs doKnnByteVectorQuery( } private static TopDocs doKnnVectorQuery( - IndexSearcher searcher, String field, float[] vector, int k, int fanout, Query filter) + IndexSearcher searcher, String field, float[] vector, int k, int fanout, Query filter, boolean isParentJoinQuery) throws IOException { + if (isParentJoinQuery) { + System.out.println("\trunning ParentJoin knnVectorQuery using approx. search"); + ParentJoinBenchmarkQuery parentJoinQuery = ParentJoinBenchmarkQuery.create(searcher.getIndexReader(), KNN_FIELD, DOCTYPE_FIELD, vector, k); + return searcher.search(parentJoinQuery, k); + } ProfiledKnnFloatVectorQuery profiledQuery = new ProfiledKnnFloatVectorQuery(field, vector, k, fanout, filter); TopDocs docs = searcher.search(profiledQuery, k); return new TopDocs(new TotalHits(profiledQuery.totalVectorCount(), docs.totalHits.relation), docs.scoreDocs); @@ -698,9 +752,14 @@ private int compareNN(int[] expected, TopDocs results) { return matched; } + /** Returns the topK nearest neighbors for each target query. + * + * The method runs "numIters" target queries and returns "topK" nearest neighbors + * for each of them. Nearest Neighbors are computed using exact match. + */ private int[][] getNN(Path docPath, Path queryPath) throws IOException { // look in working directory for cached nn file - String hash = Integer.toString(Objects.hash(docPath, queryPath, numDocs, numIters, topK, similarityFunction.ordinal()), 36); + String hash = Integer.toString(Objects.hash(docPath, queryPath, numDocs, numIters, topK, similarityFunction.ordinal(), parentJoin), 36); String nnFileName = "nn-" + hash + ".bin"; Path nnPath = Paths.get(nnFileName); if (Files.exists(nnPath) && isNewer(nnPath, docPath, queryPath) && selectivity == 1f) { @@ -837,9 +896,7 @@ public Void call() { private int[][] computeNN(Path docPath, Path queryPath) throws IOException { int[][] result = new int[numIters][]; - if (quiet == false) { - System.out.println("computing true nearest neighbors of " + numIters + " target vectors"); - } + log("computing true nearest neighbors of " + numIters + " target vectors"); List tasks = new ArrayList<>(); try (FileChannel qIn = FileChannel.open(queryPath)) { VectorReader queryReader = (VectorReader) VectorReader.create(qIn, dim, VectorEncoding.FLOAT32); @@ -868,6 +925,21 @@ class ComputeNNFloatTask implements Callable { @Override public Void call() { + if (parentJoin) { + // Use DiversifyingChildrenFloatKnnVectorQuery for parentJoins + try (Directory dir = FSDirectory.open(indexPath); + DirectoryReader reader = DirectoryReader.open(dir)) { + ParentJoinBenchmarkQuery parentJoinQuery = ParentJoinBenchmarkQuery.create(reader, KNN_FIELD, DOCTYPE_FIELD, query, topK); + TopDocs topHits = parentJoinQuery.runExactSearch(); + result[queryOrd] = new int[topK]; + int k = 0; + for (ScoreDoc scoreDoc : topHits.scoreDocs) { + result[queryOrd][k++] = scoreDoc.doc; + } + } catch (IOException e) { + throw new RuntimeException(e); + } + } else { NeighborQueue queue = new NeighborQueue(topK, false); try (FileChannel in = FileChannel.open(docPath)) { VectorReader docReader = (VectorReader) VectorReader.create(in, dim, VectorEncoding.FLOAT32); @@ -884,14 +956,24 @@ public Void call() { result[queryOrd][k] = queue.topNode(); queue.pop(); } - if (quiet == false && (queryOrd + 1) % 10 == 0) { - System.out.print(" " + (queryOrd + 1)); - System.out.flush(); + if ((queryOrd + 1) % 10 == 0) { + log(" " + (queryOrd + 1)); } } catch (IOException e) { throw new RuntimeException(e); } - return null; + } + if ((queryOrd + 1) % 10 == 0) { + log("(parentJoin=%s) top-%d results for iteration %d: %s", parentJoin, topK, queryOrd, Arrays.toString(result[queryOrd])); + } + return null; + } + } + + private void log(String msg, Object... args) { + if (quiet == false) { + System.out.printf((msg) + "%n", args); + System.out.flush(); } } diff --git a/src/main/knn/KnnIndexer.java b/src/main/knn/KnnIndexer.java index 02744df5f..7f47762e2 100644 --- a/src/main/knn/KnnIndexer.java +++ b/src/main/knn/KnnIndexer.java @@ -28,10 +28,15 @@ import org.apache.lucene.store.FSDirectory; import org.apache.lucene.util.PrintStreamInfoStream; +import java.io.BufferedReader; import java.io.IOException; import java.nio.channels.FileChannel; +import java.nio.file.Files; import java.nio.file.Path; +import java.util.ArrayList; +import java.util.List; import java.util.concurrent.TimeUnit; +import java.util.stream.Collectors; public class KnnIndexer { // use smaller ram buffer so we get to merging sooner, making better use of @@ -48,9 +53,12 @@ public class KnnIndexer { int numDocs; int docsStartIndex; boolean quiet; + boolean parentJoin; + Path parentJoinMetaPath; public KnnIndexer(Path docsPath, Path indexPath, Codec codec, VectorEncoding vectorEncoding, int dim, - VectorSimilarityFunction similarityFunction, int numDocs, int docsStartIndex, boolean quiet) { + VectorSimilarityFunction similarityFunction, int numDocs, int docsStartIndex, boolean quiet, + boolean parentJoin, Path parentJoinMetaPath) { this.docsPath = docsPath; this.indexPath = indexPath; this.codec = codec; @@ -60,6 +68,8 @@ public KnnIndexer(Path docsPath, Path indexPath, Codec codec, VectorEncoding vec this.numDocs = numDocs; this.docsStartIndex = docsStartIndex; this.quiet = quiet; + this.parentJoin = parentJoin; + this.parentJoinMetaPath = parentJoinMetaPath; } public int createIndex() throws IOException { @@ -76,7 +86,7 @@ public int createIndex() throws IOException { case FLOAT32 -> KnnFloatVectorField.createFieldType(dim, similarityFunction); }; if (quiet == false) { - iwc.setInfoStream(new PrintStreamInfoStream(System.out)); +// iwc.setInfoStream(new PrintStreamInfoStream(System.out)); System.out.println("creating index in " + indexPath); } @@ -92,32 +102,101 @@ public int createIndex() throws IOException { seekToStartDoc(in, dim, vectorEncoding, docsStartIndex); } VectorReader vectorReader = VectorReader.create(in, dim, vectorEncoding); - for (int i = 0; i < numDocs; i++) { - Document doc = new Document(); - switch (vectorEncoding) { - case BYTE -> doc.add( - new KnnByteVectorField( - KnnGraphTester.KNN_FIELD, ((VectorReaderByte) vectorReader).nextBytes(), fieldType)); - case FLOAT32 -> doc.add( - new KnnFloatVectorField(KnnGraphTester.KNN_FIELD, vectorReader.next(), fieldType)); + if (parentJoin == false) { + for (int i = 0; i < numDocs; i++) { + Document doc = new Document(); + switch (vectorEncoding) { + case BYTE -> doc.add( + new KnnByteVectorField( + KnnGraphTester.KNN_FIELD, ((VectorReaderByte) vectorReader).nextBytes(), fieldType)); + case FLOAT32 -> doc.add( + new KnnFloatVectorField(KnnGraphTester.KNN_FIELD, vectorReader.next(), fieldType)); + } + doc.add(new StoredField(KnnGraphTester.ID_FIELD, i)); + iw.addDocument(doc); + + if ((i + 1) % 25000 == 0) { + System.out.println("Done indexing " + (i + 1) + " documents."); + } } - doc.add(new StoredField(KnnGraphTester.ID_FIELD, i)); - iw.addDocument(doc); + } else { + // create parent-block join documents + try (BufferedReader br = Files.newBufferedReader(parentJoinMetaPath)) { + String[] headers = br.readLine().trim().split(","); + if (headers.length != 2) { + throw new IllegalStateException("Expected two columns in parentJoinMetadata csv. Found: " + headers.length); + } + log("Parent join metaFile columns: %s | %s", headers[0], headers[1]); + int childDocs = 0; + int parentDocs = 0; + int docIds = 0; + String prevWikiId = "null"; + String currWikiId; + List block = new ArrayList<>(); + do { + String[] line = br.readLine().trim().split(","); + currWikiId = line[0]; + String currParaId = line[1]; + Document doc = new Document(); + switch (vectorEncoding) { + case BYTE -> doc.add( + new KnnByteVectorField( + KnnGraphTester.KNN_FIELD, ((VectorReaderByte) vectorReader).nextBytes(), fieldType)); + case FLOAT32 -> doc.add( + new KnnFloatVectorField(KnnGraphTester.KNN_FIELD, vectorReader.next(), fieldType)); + } + doc.add(new StoredField(KnnGraphTester.ID_FIELD, docIds++)); + doc.add(new StringField(KnnGraphTester.WIKI_ID_FIELD, currWikiId, Field.Store.YES)); + doc.add(new StringField(KnnGraphTester.WIKI_PARA_ID_FIELD, currParaId, Field.Store.YES)); + doc.add(new StringField(KnnGraphTester.DOCTYPE_FIELD, "_child", Field.Store.NO)); + childDocs++; - if ((i+1) % 25000 == 0) { - System.out.println("Done indexing " + (i + 1) + " documents."); + // Close block and create a new one when wiki article changes. + if (!currWikiId.equals(prevWikiId) && !"null".equals(prevWikiId)) { + Document parent = new Document(); + parent.add(new StoredField(KnnGraphTester.ID_FIELD, docIds++)); + parent.add(new StringField(KnnGraphTester.DOCTYPE_FIELD, "_parent", Field.Store.NO)); + parent.add(new StringField(KnnGraphTester.WIKI_ID_FIELD, prevWikiId, Field.Store.YES)); + parent.add(new StringField(KnnGraphTester.WIKI_PARA_ID_FIELD, "_", Field.Store.YES)); + block.add(parent); + iw.addDocuments(block); + parentDocs++; + if (parentDocs % 1000 == 0) { + String subDocs = block.stream().map(d -> d.get(KnnGraphTester.WIKI_PARA_ID_FIELD)).collect(Collectors.joining("-")); + log("parentDocId = %s, numSubDocs = %d, subDocs = %s", currWikiId, block.size() - 1, subDocs); + } + // create new block for the next article + block = new ArrayList<>(); + block.add(doc); + } else { + block.add(doc); + } + prevWikiId = currWikiId; + if (childDocs % 25000 == 0) { + log("indexed %d child documents, with %d parents", childDocs, parentDocs); + } + } while (childDocs < numDocs); + if (!block.isEmpty()) { + Document parent = new Document(); + parent.add(new StoredField(KnnGraphTester.ID_FIELD, docIds++)); + parent.add(new StringField(KnnGraphTester.DOCTYPE_FIELD, "_parent", Field.Store.NO)); + parent.add(new StringField(KnnGraphTester.WIKI_ID_FIELD, prevWikiId, Field.Store.YES)); + parent.add(new StringField(KnnGraphTester.WIKI_PARA_ID_FIELD, "_", Field.Store.YES)); + block.add(parent); + iw.addDocuments(block); + } + log("Indexed %d documents with %d parent docs. now flush", childDocs, parentDocs); } } - if (quiet == false) { - System.out.println("Done indexing " + numDocs + " documents; now flush"); - } +// iw.flush(); } } long elapsed = System.nanoTime() - start; - if (quiet == false) { - System.out.println( - "Indexed " + numDocs + " documents in " + TimeUnit.NANOSECONDS.toSeconds(elapsed) + "s"); - } + log("Indexed %d docs in %d seconds", numDocs, TimeUnit.NANOSECONDS.toSeconds(elapsed)); +// if (quiet == false) { +// System.out.println( +// "Indexed " + numDocs + " documents in " + TimeUnit.NANOSECONDS.toSeconds(elapsed) + "s"); +// } return (int) TimeUnit.NANOSECONDS.toMillis(elapsed); } @@ -125,4 +204,10 @@ private void seekToStartDoc(FileChannel in, int dim, VectorEncoding vectorEncodi int startByte = docsStartIndex * dim * vectorEncoding.byteSize; in.position(startByte); } + + private void log(String msg, Object... args) { + if (quiet == false) { + System.out.printf((msg) + "%n", args); + } + } } diff --git a/src/main/knn/KnnIndexerMain.java b/src/main/knn/KnnIndexerMain.java index 66dc01ce6..3e136b744 100644 --- a/src/main/knn/KnnIndexerMain.java +++ b/src/main/knn/KnnIndexerMain.java @@ -19,6 +19,7 @@ import java.io.IOException; import java.nio.file.Path; +import java.nio.file.Paths; import java.util.concurrent.ExecutorService; import java.util.concurrent.Executors; @@ -38,6 +39,8 @@ public class KnnIndexerMain { public int docStartIndex = 0; boolean quiet = false; + boolean parentJoin = false; + Path parentJoinMetaFile = null; @Override public String toString() { @@ -73,6 +76,10 @@ public static void main(String[] args) throws IOException { case "-docstartindex" -> inputs.docStartIndex = Integer.parseInt(args[++i]); case "-dimension" -> inputs.dimension = Integer.parseInt(args[++i]); case "-quiet" -> inputs.quiet = true; + case "-parentjoin" -> { + inputs.parentJoin = true; + inputs.parentJoinMetaFile = Paths.get(args[++i]); + } default -> throw new IllegalArgumentException("Cannot recognize the option " + args[i]); } i++; @@ -96,7 +103,8 @@ public static void main(String[] args) throws IOException { new KnnIndexer(inputs.docVectorsPath, inputs.indexPath, KnnGraphTester.getCodec(inputs.maxConn, inputs.beamWidth, exec, numMergeWorker, quantize, quantizeBits, quantizeCompress), inputs.vectorEncoding, - inputs.dimension, inputs.similarityFunction, inputs.numDocs, inputs.docStartIndex, inputs.quiet).createIndex(); + inputs.dimension, inputs.similarityFunction, inputs.numDocs, inputs.docStartIndex, inputs.quiet, + inputs.parentJoin, inputs.parentJoinMetaFile).createIndex(); if (!inputs.quiet) { System.out.println("Successfully created index."); @@ -114,6 +122,7 @@ public String usage() { "\t -similarityFunction : similarity function for vector comparison. One of ( EUCLIDEAN, DOT_PRODUCT, COSINE, MAXIMUM_INNER_PRODUCT )\n" + "\t -numDocs : number of document vectors to be used from the file\n" + "\t -docStartIndex : Start index of first document vector. This can be helpful when we want to run different with set of documents from within the same file.\n" + - "\t -quiet : don't print anything on console if mentioned.\n"; + "\t -quiet : don't print anything on console if mentioned.\n" + + "\t -parentJoin : create parentJoin index. Requires '*-metadata.csv'\n"; } } diff --git a/src/main/knn/ParentJoinBenchmarkQuery.java b/src/main/knn/ParentJoinBenchmarkQuery.java new file mode 100644 index 000000000..6ee682629 --- /dev/null +++ b/src/main/knn/ParentJoinBenchmarkQuery.java @@ -0,0 +1,109 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package knn; + +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.LeafReader; +import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.QueryTimeout; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.DocIdSet; +import org.apache.lucene.search.DocIdSetIterator; +import org.apache.lucene.search.FilteredDocIdSetIterator; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.ScoreDoc; +import org.apache.lucene.search.ScoreMode; +import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.TopDocs; +import org.apache.lucene.search.Weight; +import org.apache.lucene.search.join.BitSetProducer; +import org.apache.lucene.search.join.CheckJoinIndex; +import org.apache.lucene.search.join.DiversifyingChildrenFloatKnnVectorQuery; +import org.apache.lucene.search.join.QueryBitSetProducer; +import org.apache.lucene.util.Bits; + +import java.io.IOException; +import java.util.List; + +public class ParentJoinBenchmarkQuery extends DiversifyingChildrenFloatKnnVectorQuery { + + IndexReader reader; + int topK; + + static ParentJoinBenchmarkQuery create(IndexReader reader, String knnField, String parentField, float[] queryVector, int topK) throws IOException { + BitSetProducer parentsFilter = + new QueryBitSetProducer(new TermQuery(new Term(parentField, "_parent"))); + CheckJoinIndex.check(reader, parentsFilter); + System.out.println("Index is parentJoin eligible..."); + System.out.println("Creating ParentJoinQuery. Reader statistics:"); + System.out.flush(); + return new ParentJoinBenchmarkQuery(reader, knnField, queryVector, null, topK, parentsFilter); + } + + ParentJoinBenchmarkQuery(IndexReader reader, String field, float[] query, Query childFilter, int k, BitSetProducer parentsFilter) throws IOException { + super(field, query, childFilter, k, parentsFilter); + this.reader = reader; + this.topK = k; + } + + // expose for benchmarking + @Override + public TopDocs exactSearch(LeafReaderContext context, DocIdSetIterator acceptIterator, QueryTimeout queryTimeout) throws IOException { + return super.exactSearch(context, acceptIterator, queryTimeout); + } + + public TopDocs runExactSearch() throws IOException { + IndexSearcher searcher = new IndexSearcher(reader); + List leafReaderContexts = reader.leaves(); + TopDocs[] perLeafResults = new TopDocs[leafReaderContexts.size()]; + int leaf = 0; + System.out.println("Num leaves in index: " + leafReaderContexts.size()); + for (LeafReaderContext ctx : leafReaderContexts) { + final LeafReader r = ctx.reader(); + TermQuery children = new TermQuery(new Term("docType", "_child")); + Weight childrenWeight = children.createWeight(searcher, ScoreMode.COMPLETE_NO_SCORES, 1f); + DocIdSetIterator acceptDocs = childrenWeight.scorer(ctx).iterator(); +// Bits liveDocs = r.getLiveDocs(); +// FilteredDocIdSetIterator acceptDocs = +// new FilteredDocIdSetIterator(DocIdSetIterator.all(r.maxDoc())) { +// @Override +// protected boolean match(int doc) { +// return liveDocs == null || liveDocs.get(doc); +// } +// }; + System.out.println("Running exactSearch for leaf: " + leaf); + System.out.println("Leaf maxdoc: " + r.maxDoc()); + System.out.println("Leaf numDocs: " + r.numDocs()); + System.out.println("Accept Docs with childrenWeight DISI cost: " + acceptDocs.cost()); + System.out.flush(); + perLeafResults[leaf] = exactSearch(ctx, acceptDocs, null); + if (ctx.docBase > 0) { + for (ScoreDoc scoreDoc : perLeafResults[leaf].scoreDocs) { + scoreDoc.doc += ctx.docBase; + } + } + leaf++; + } + return super.mergeLeafResults(perLeafResults); + } + + private static void log(String msg, Object... args) { + System.out.printf((msg) + "%n", args); + } +} diff --git a/src/python/knnPerfTest.py b/src/python/knnPerfTest.py index ab31a53e6..82a8f3385 100644 --- a/src/python/knnPerfTest.py +++ b/src/python/knnPerfTest.py @@ -6,6 +6,7 @@ import constants import re from common import getLuceneDirFromGradleProperties +import localconstants # Measure vector search recall and latency while exploring hyperparameters @@ -27,6 +28,10 @@ # Where the version of Lucene is that will be tested. Now this will be sourced from gradle.properties LUCENE_CHECKOUT = getLuceneDirFromGradleProperties() +# Where the version of Lucene is that will be tested. Expected to be in the base dir above luceneutil. +# LUCENE_CHECKOUT = 'baseline' +# LUCENE_CHECKOUT = 'candidate' +# LUCENE_CHECKOUT = 'trunk' # e.g. to compile KnnIndexer: # @@ -38,7 +43,7 @@ #'ndoc': (10000, 100000, 1000000), #'ndoc': (10000, 100000, 200000, 500000), #'ndoc': (10000, 100000, 200000, 500000), - 'ndoc': (250_000,), + 'ndoc': (50_000,), #'ndoc': (100000,), #'maxConn': (32, 64, 96), #'maxConn': (64, ), @@ -49,7 +54,7 @@ #'fanout': (20, 100, 250) 'fanout': (20,), #'quantize': None, - 'quantizeBits': (4, 7, 8), + # 'quantizeBits': (4, 7, 8), 'numMergeWorker': (12,), 'numMergeThread': (4,), 'encoding': ('float32',), @@ -91,11 +96,15 @@ def run_knn_benchmark(checkout, values): #dim = 256 #doc_vectors = '/d/electronics_asin_emb.bin' #query_vectors = '/d/electronics_query_vectors.bin' + dim = 768 + doc_vectors = f"{localconstants.BASE_DIR}/data/{'cohere-wikipedia'}-docs-{dim}d.vec" + query_vectors = f"{localconstants.BASE_DIR}/data/{'cohere-wikipedia'}-queries-{dim}d.vec" + parentJoin_meta_file = f"{localconstants.BASE_DIR}/data/{'cohere-wikipedia'}-metadata.csv" # Cohere dataset - dim = 768 - doc_vectors = '%s/data/cohere-wikipedia-768.vec' % constants.BASE_DIR - query_vectors = '%s/data/cohere-wikipedia-queries-768.vec' % constants.BASE_DIR +# dim = 768 +# doc_vectors = '%s/data/cohere-wikipedia-768.vec' % constants.BASE_DIR +# query_vectors = '%s/data/cohere-wikipedia-queries-768.vec' % constants.BASE_DIR cp = benchUtil.classPathToString(benchUtil.getClassPath(checkout)) cmd = constants.JAVA_EXE.split(' ') + ['-cp', cp, '--add-modules', 'jdk.incubator.vector', @@ -130,9 +139,11 @@ def run_knn_benchmark(checkout, values): '-reindex', '-search', query_vectors, #'-metric', 'euclidean', + '-parentJoin', parentJoin_meta_file, # '-numMergeThread', '8', '-numMergeWorker', '8', # '-forceMerge', - '-quiet'] + # '-quiet' + ] print(f' cmd: {this_cmd}') job = subprocess.Popen(this_cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, encoding='utf-8') re_summary = re.compile(r'^SUMMARY: (.*?)$', re.MULTILINE) From 8b57eeeb8112accb674e936bfdccfce0280b44de Mon Sep 17 00:00:00 2001 From: Vigya Sharma Date: Fri, 6 Sep 2024 10:45:38 -0700 Subject: [PATCH 02/13] clean up debug log lines --- src/main/knn/KnnGraphTester.java | 9 +++------ src/main/knn/KnnIndexer.java | 4 ---- src/main/knn/ParentJoinBenchmarkQuery.java | 21 --------------------- 3 files changed, 3 insertions(+), 31 deletions(-) diff --git a/src/main/knn/KnnGraphTester.java b/src/main/knn/KnnGraphTester.java index e46700706..5ff45cbaa 100644 --- a/src/main/knn/KnnGraphTester.java +++ b/src/main/knn/KnnGraphTester.java @@ -585,7 +585,6 @@ private void testSearch(Path indexPath, Path queryPath, Path outputPath, int[][] Query bitSetQuery = prefilter ? new BitSetQuery(matchDocs) : null; for (int i = 0; i < numIters; i++) { // warm up - log("\t...warm up for query #%d", i); if (vectorEncoding.equals(VectorEncoding.BYTE)) { byte[] target = targetReaderByte.nextBytes(); if (prefilter) { @@ -606,7 +605,6 @@ private void testSearch(Path indexPath, Path queryPath, Path outputPath, int[][] start = System.nanoTime(); cpuTimeStartNs = bean.getCurrentThreadCpuTime(); for (int i = 0; i < numIters; i++) { - log("\t...running search for query #%d", i); if (vectorEncoding.equals(VectorEncoding.BYTE)) { byte[] target = targetReaderByte.nextBytes(); if (prefilter) { @@ -718,7 +716,6 @@ private static TopDocs doKnnVectorQuery( IndexSearcher searcher, String field, float[] vector, int k, int fanout, Query filter, boolean isParentJoinQuery) throws IOException { if (isParentJoinQuery) { - System.out.println("\trunning ParentJoin knnVectorQuery using approx. search"); ParentJoinBenchmarkQuery parentJoinQuery = ParentJoinBenchmarkQuery.create(searcher.getIndexReader(), KNN_FIELD, DOCTYPE_FIELD, vector, k); return searcher.search(parentJoinQuery, k); } @@ -757,7 +754,7 @@ private int compareNN(int[] expected, TopDocs results) { * The method runs "numIters" target queries and returns "topK" nearest neighbors * for each of them. Nearest Neighbors are computed using exact match. */ - private int[][] getNN(Path docPath, Path queryPath) throws IOException { + private int[][] getNN(Path docPath, Path queryPath) throws IOException, InterruptedException { // look in working directory for cached nn file String hash = Integer.toString(Objects.hash(docPath, queryPath, numDocs, numIters, topK, similarityFunction.ordinal(), parentJoin), 36); String nnFileName = "nn-" + hash + ".bin"; @@ -832,7 +829,7 @@ private static FixedBitSet generateRandomBitSet(int size, float selectivity) { return bitSet; } - private int[][] computeNNByte(Path docPath, Path queryPath) throws IOException { + private int[][] computeNNByte(Path docPath, Path queryPath) throws IOException, InterruptedException { int[][] result = new int[numIters][]; if (quiet == false) { System.out.println("computing true nearest neighbors of " + numIters + " target vectors"); @@ -894,7 +891,7 @@ public Void call() { /** Brute force computation of "true" nearest neighhbors. */ private int[][] computeNN(Path docPath, Path queryPath) - throws IOException { + throws IOException, InterruptedException { int[][] result = new int[numIters][]; log("computing true nearest neighbors of " + numIters + " target vectors"); List tasks = new ArrayList<>(); diff --git a/src/main/knn/KnnIndexer.java b/src/main/knn/KnnIndexer.java index 7f47762e2..512487115 100644 --- a/src/main/knn/KnnIndexer.java +++ b/src/main/knn/KnnIndexer.java @@ -193,10 +193,6 @@ public int createIndex() throws IOException { } long elapsed = System.nanoTime() - start; log("Indexed %d docs in %d seconds", numDocs, TimeUnit.NANOSECONDS.toSeconds(elapsed)); -// if (quiet == false) { -// System.out.println( -// "Indexed " + numDocs + " documents in " + TimeUnit.NANOSECONDS.toSeconds(elapsed) + "s"); -// } return (int) TimeUnit.NANOSECONDS.toMillis(elapsed); } diff --git a/src/main/knn/ParentJoinBenchmarkQuery.java b/src/main/knn/ParentJoinBenchmarkQuery.java index 6ee682629..47abed9ca 100644 --- a/src/main/knn/ParentJoinBenchmarkQuery.java +++ b/src/main/knn/ParentJoinBenchmarkQuery.java @@ -50,9 +50,6 @@ static ParentJoinBenchmarkQuery create(IndexReader reader, String knnField, Stri BitSetProducer parentsFilter = new QueryBitSetProducer(new TermQuery(new Term(parentField, "_parent"))); CheckJoinIndex.check(reader, parentsFilter); - System.out.println("Index is parentJoin eligible..."); - System.out.println("Creating ParentJoinQuery. Reader statistics:"); - System.out.flush(); return new ParentJoinBenchmarkQuery(reader, knnField, queryVector, null, topK, parentsFilter); } @@ -73,25 +70,11 @@ public TopDocs runExactSearch() throws IOException { List leafReaderContexts = reader.leaves(); TopDocs[] perLeafResults = new TopDocs[leafReaderContexts.size()]; int leaf = 0; - System.out.println("Num leaves in index: " + leafReaderContexts.size()); for (LeafReaderContext ctx : leafReaderContexts) { final LeafReader r = ctx.reader(); TermQuery children = new TermQuery(new Term("docType", "_child")); Weight childrenWeight = children.createWeight(searcher, ScoreMode.COMPLETE_NO_SCORES, 1f); DocIdSetIterator acceptDocs = childrenWeight.scorer(ctx).iterator(); -// Bits liveDocs = r.getLiveDocs(); -// FilteredDocIdSetIterator acceptDocs = -// new FilteredDocIdSetIterator(DocIdSetIterator.all(r.maxDoc())) { -// @Override -// protected boolean match(int doc) { -// return liveDocs == null || liveDocs.get(doc); -// } -// }; - System.out.println("Running exactSearch for leaf: " + leaf); - System.out.println("Leaf maxdoc: " + r.maxDoc()); - System.out.println("Leaf numDocs: " + r.numDocs()); - System.out.println("Accept Docs with childrenWeight DISI cost: " + acceptDocs.cost()); - System.out.flush(); perLeafResults[leaf] = exactSearch(ctx, acceptDocs, null); if (ctx.docBase > 0) { for (ScoreDoc scoreDoc : perLeafResults[leaf].scoreDocs) { @@ -102,8 +85,4 @@ public TopDocs runExactSearch() throws IOException { } return super.mergeLeafResults(perLeafResults); } - - private static void log(String msg, Object... args) { - System.out.printf((msg) + "%n", args); - } } From 2c15f589e80c4840e556cee4fed7bd54c88910be Mon Sep 17 00:00:00 2001 From: Vigya Sharma Date: Fri, 6 Sep 2024 10:53:23 -0700 Subject: [PATCH 03/13] use labels to constants --- src/main/knn/KnnGraphTester.java | 6 ++++-- src/main/knn/KnnIndexer.java | 9 ++++++--- src/main/knn/ParentJoinBenchmarkQuery.java | 15 ++++++++------- 3 files changed, 18 insertions(+), 12 deletions(-) diff --git a/src/main/knn/KnnGraphTester.java b/src/main/knn/KnnGraphTester.java index 5ff45cbaa..01c1d4239 100644 --- a/src/main/knn/KnnGraphTester.java +++ b/src/main/knn/KnnGraphTester.java @@ -122,6 +122,8 @@ public class KnnGraphTester { public static final String ID_FIELD = "id"; private static final String INDEX_DIR = "knnIndices"; public static final String DOCTYPE_FIELD = "docType"; + public static final String DOCTYPE_PARENT = "_parent"; + public static final String DOCTYPE_CHILD = "_child"; public static final String WIKI_ID_FIELD = "wikiID"; public static final String WIKI_PARA_ID_FIELD = "wikiParaID"; @@ -716,7 +718,7 @@ private static TopDocs doKnnVectorQuery( IndexSearcher searcher, String field, float[] vector, int k, int fanout, Query filter, boolean isParentJoinQuery) throws IOException { if (isParentJoinQuery) { - ParentJoinBenchmarkQuery parentJoinQuery = ParentJoinBenchmarkQuery.create(searcher.getIndexReader(), KNN_FIELD, DOCTYPE_FIELD, vector, k); + ParentJoinBenchmarkQuery parentJoinQuery = ParentJoinBenchmarkQuery.create(searcher.getIndexReader(), vector, k); return searcher.search(parentJoinQuery, k); } ProfiledKnnFloatVectorQuery profiledQuery = new ProfiledKnnFloatVectorQuery(field, vector, k, fanout, filter); @@ -926,7 +928,7 @@ public Void call() { // Use DiversifyingChildrenFloatKnnVectorQuery for parentJoins try (Directory dir = FSDirectory.open(indexPath); DirectoryReader reader = DirectoryReader.open(dir)) { - ParentJoinBenchmarkQuery parentJoinQuery = ParentJoinBenchmarkQuery.create(reader, KNN_FIELD, DOCTYPE_FIELD, query, topK); + ParentJoinBenchmarkQuery parentJoinQuery = ParentJoinBenchmarkQuery.create(reader, query, topK); TopDocs topHits = parentJoinQuery.runExactSearch(); result[queryOrd] = new int[topK]; int k = 0; diff --git a/src/main/knn/KnnIndexer.java b/src/main/knn/KnnIndexer.java index 512487115..61f768880 100644 --- a/src/main/knn/KnnIndexer.java +++ b/src/main/knn/KnnIndexer.java @@ -38,6 +38,9 @@ import java.util.concurrent.TimeUnit; import java.util.stream.Collectors; +import static knn.KnnGraphTester.DOCTYPE_CHILD; +import static knn.KnnGraphTester.DOCTYPE_PARENT; + public class KnnIndexer { // use smaller ram buffer so we get to merging sooner, making better use of // many cores (TODO: use multiple indexing threads): @@ -148,14 +151,14 @@ public int createIndex() throws IOException { doc.add(new StoredField(KnnGraphTester.ID_FIELD, docIds++)); doc.add(new StringField(KnnGraphTester.WIKI_ID_FIELD, currWikiId, Field.Store.YES)); doc.add(new StringField(KnnGraphTester.WIKI_PARA_ID_FIELD, currParaId, Field.Store.YES)); - doc.add(new StringField(KnnGraphTester.DOCTYPE_FIELD, "_child", Field.Store.NO)); + doc.add(new StringField(KnnGraphTester.DOCTYPE_FIELD, DOCTYPE_CHILD, Field.Store.NO)); childDocs++; // Close block and create a new one when wiki article changes. if (!currWikiId.equals(prevWikiId) && !"null".equals(prevWikiId)) { Document parent = new Document(); parent.add(new StoredField(KnnGraphTester.ID_FIELD, docIds++)); - parent.add(new StringField(KnnGraphTester.DOCTYPE_FIELD, "_parent", Field.Store.NO)); + parent.add(new StringField(KnnGraphTester.DOCTYPE_FIELD, DOCTYPE_PARENT, Field.Store.NO)); parent.add(new StringField(KnnGraphTester.WIKI_ID_FIELD, prevWikiId, Field.Store.YES)); parent.add(new StringField(KnnGraphTester.WIKI_PARA_ID_FIELD, "_", Field.Store.YES)); block.add(parent); @@ -179,7 +182,7 @@ public int createIndex() throws IOException { if (!block.isEmpty()) { Document parent = new Document(); parent.add(new StoredField(KnnGraphTester.ID_FIELD, docIds++)); - parent.add(new StringField(KnnGraphTester.DOCTYPE_FIELD, "_parent", Field.Store.NO)); + parent.add(new StringField(KnnGraphTester.DOCTYPE_FIELD, DOCTYPE_PARENT, Field.Store.NO)); parent.add(new StringField(KnnGraphTester.WIKI_ID_FIELD, prevWikiId, Field.Store.YES)); parent.add(new StringField(KnnGraphTester.WIKI_PARA_ID_FIELD, "_", Field.Store.YES)); block.add(parent); diff --git a/src/main/knn/ParentJoinBenchmarkQuery.java b/src/main/knn/ParentJoinBenchmarkQuery.java index 47abed9ca..6039810f5 100644 --- a/src/main/knn/ParentJoinBenchmarkQuery.java +++ b/src/main/knn/ParentJoinBenchmarkQuery.java @@ -41,20 +41,22 @@ import java.io.IOException; import java.util.List; +import static knn.KnnGraphTester.*; + public class ParentJoinBenchmarkQuery extends DiversifyingChildrenFloatKnnVectorQuery { IndexReader reader; int topK; - static ParentJoinBenchmarkQuery create(IndexReader reader, String knnField, String parentField, float[] queryVector, int topK) throws IOException { + static ParentJoinBenchmarkQuery create(IndexReader reader, float[] queryVector, int topK) throws IOException { BitSetProducer parentsFilter = - new QueryBitSetProducer(new TermQuery(new Term(parentField, "_parent"))); + new QueryBitSetProducer(new TermQuery(new Term(DOCTYPE_FIELD, DOCTYPE_PARENT))); CheckJoinIndex.check(reader, parentsFilter); - return new ParentJoinBenchmarkQuery(reader, knnField, queryVector, null, topK, parentsFilter); + return new ParentJoinBenchmarkQuery(reader, queryVector, null, topK, parentsFilter); } - ParentJoinBenchmarkQuery(IndexReader reader, String field, float[] query, Query childFilter, int k, BitSetProducer parentsFilter) throws IOException { - super(field, query, childFilter, k, parentsFilter); + ParentJoinBenchmarkQuery(IndexReader reader, float[] query, Query childFilter, int k, BitSetProducer parentsFilter) throws IOException { + super(KNN_FIELD, query, childFilter, k, parentsFilter); this.reader = reader; this.topK = k; } @@ -71,8 +73,7 @@ public TopDocs runExactSearch() throws IOException { TopDocs[] perLeafResults = new TopDocs[leafReaderContexts.size()]; int leaf = 0; for (LeafReaderContext ctx : leafReaderContexts) { - final LeafReader r = ctx.reader(); - TermQuery children = new TermQuery(new Term("docType", "_child")); + TermQuery children = new TermQuery(new Term(DOCTYPE_FIELD, DOCTYPE_CHILD)); Weight childrenWeight = children.createWeight(searcher, ScoreMode.COMPLETE_NO_SCORES, 1f); DocIdSetIterator acceptDocs = childrenWeight.scorer(ctx).iterator(); perLeafResults[leaf] = exactSearch(ctx, acceptDocs, null); From 63543a206d93f1ef900fe3eff37eeeb40a7f1f55 Mon Sep 17 00:00:00 2001 From: Vigya Sharma Date: Fri, 6 Sep 2024 14:22:18 -0700 Subject: [PATCH 04/13] parent join working --- src/main/knn/KnnGraphTester.java | 67 ++++++---------------- src/main/knn/KnnIndexer.java | 9 +-- src/main/knn/KnnTesterUtils.java | 30 ++++++++++ src/main/knn/ParentJoinBenchmarkQuery.java | 4 -- 4 files changed, 48 insertions(+), 62 deletions(-) create mode 100644 src/main/knn/KnnTesterUtils.java diff --git a/src/main/knn/KnnGraphTester.java b/src/main/knn/KnnGraphTester.java index 01c1d4239..e9aa0baef 100644 --- a/src/main/knn/KnnGraphTester.java +++ b/src/main/knn/KnnGraphTester.java @@ -54,26 +54,17 @@ import org.apache.lucene.codecs.lucene99.Lucene99HnswScalarQuantizedVectorsFormat; import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsReader; import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat; -import org.apache.lucene.document.Document; -import org.apache.lucene.document.FieldType; -import org.apache.lucene.document.KnnByteVectorField; -import org.apache.lucene.document.KnnFloatVectorField; -import org.apache.lucene.document.StoredField; import org.apache.lucene.index.CodecReader; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.LeafReader; import org.apache.lucene.index.LeafReaderContext; -import org.apache.lucene.index.QueryTimeout; import org.apache.lucene.index.StoredFields; -import org.apache.lucene.index.Term; import org.apache.lucene.index.VectorEncoding; import org.apache.lucene.index.VectorSimilarityFunction; import org.apache.lucene.search.ConstantScoreScorer; import org.apache.lucene.search.ConstantScoreWeight; -import org.apache.lucene.search.DocIdSetIterator; -import org.apache.lucene.search.FilteredDocIdSetIterator; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.KnnByteVectorQuery; import org.apache.lucene.search.KnnFloatVectorQuery; @@ -83,20 +74,12 @@ import org.apache.lucene.search.ScoreMode; import org.apache.lucene.search.Scorer; import org.apache.lucene.search.ScorerSupplier; -import org.apache.lucene.search.TermQuery; import org.apache.lucene.search.TopDocs; import org.apache.lucene.search.TotalHits; import org.apache.lucene.search.Weight; -import org.apache.lucene.search.join.BitSetProducer; -import org.apache.lucene.search.join.CheckJoinIndex; -import org.apache.lucene.search.join.DiversifyingChildrenFloatKnnVectorQuery; -import org.apache.lucene.search.join.QueryBitSetProducer; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; -import org.apache.lucene.util.BitSet; import org.apache.lucene.util.BitSetIterator; -import org.apache.lucene.util.Bits; -import org.apache.lucene.util.DocIdSetBuilder; import org.apache.lucene.util.FixedBitSet; import org.apache.lucene.util.NamedThreadFactory; import org.apache.lucene.util.PrintStreamInfoStream; @@ -568,6 +551,7 @@ private void printHist(int[] hist, int max, int count, int nbuckets) { private void testSearch(Path indexPath, Path queryPath, Path outputPath, int[][] nn) throws IOException { TopDocs[] results = new TopDocs[numIters]; + int[][] resultIds = new int[numIters][]; long elapsed, totalCpuTimeMS, totalVisited = 0; ExecutorService executorService = Executors.newFixedThreadPool(8); try (FileChannel input = FileChannel.open(queryPath)) { @@ -634,19 +618,12 @@ private void testSearch(Path indexPath, Path queryPath, Path outputPath, int[][] totalCpuTimeMS = TimeUnit.NANOSECONDS.toMillis(bean.getCurrentThreadCpuTime() - cpuTimeStartNs); elapsed = TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - start); // ns -> ms + + // Fetch, validate and write result document ids. StoredFields storedFields = reader.storedFields(); for (int i = 0; i < numIters; i++) { totalVisited += results[i].totalHits.value; - for (ScoreDoc doc : results[i].scoreDocs) { - if (doc.doc != NO_MORE_DOCS) { - // there is a bug somewhere that can result in doc=NO_MORE_DOCS! I think it happens - // in some degenerate case (like input query has NaN in it?) that causes no results to - // be returned from HNSW search? - doc.doc = Integer.parseInt(storedFields.document(doc.doc).get("id")); - } else { - System.out.println("NO_MORE_DOCS!"); - } - } + resultIds[i] = KnnTesterUtils.getResultIds(results[i], storedFields); } } if (quiet == false) { @@ -666,22 +643,17 @@ private void testSearch(Path indexPath, Path queryPath, Path outputPath, int[][] executorService.shutdown(); } if (outputPath != null) { - ByteBuffer buf = ByteBuffer.allocate(4); - IntBuffer ibuf = buf.order(ByteOrder.LITTLE_ENDIAN).asIntBuffer(); + ByteBuffer tmp = + ByteBuffer.allocate(resultIds[0].length * Integer.BYTES).order(ByteOrder.LITTLE_ENDIAN); try (OutputStream out = Files.newOutputStream(outputPath)) { for (int i = 0; i < numIters; i++) { - for (ScoreDoc doc : results[i].scoreDocs) { - ibuf.position(0); - ibuf.put(doc.doc); - out.write(buf.array()); - } + tmp.asIntBuffer().put(nn[i]); + out.write(tmp.array()); } } } else { - if (quiet == false) { - System.out.println("checking results"); - } - float recall = checkResults(results, nn); + log("checking results"); + float recall = checkResults(resultIds, nn); totalVisited /= numIters; String quantizeDesc; if (quantize) { @@ -726,7 +698,7 @@ private static TopDocs doKnnVectorQuery( return new TopDocs(new TotalHits(profiledQuery.totalVectorCount(), docs.totalHits.relation), docs.scoreDocs); } - private float checkResults(TopDocs[] results, int[][] nn) { + private float checkResults(int[][] results, int[][] nn) { int totalMatches = 0; int totalResults = results.length * topK; for (int i = 0; i < results.length; i++) { @@ -737,14 +709,14 @@ private float checkResults(TopDocs[] results, int[][] nn) { return totalMatches / (float) totalResults; } - private int compareNN(int[] expected, TopDocs results) { + private int compareNN(int[] expected, int[] results) { int matched = 0; Set expectedSet = new HashSet<>(); for (int i = 0; i < topK; i++) { expectedSet.add(expected[i]); } - for (ScoreDoc scoreDoc : results.scoreDocs) { - if (expectedSet.contains(scoreDoc.doc)) { + for (int docId : results) { + if (expectedSet.contains(docId)) { ++matched; } } @@ -930,15 +902,13 @@ public Void call() { DirectoryReader reader = DirectoryReader.open(dir)) { ParentJoinBenchmarkQuery parentJoinQuery = ParentJoinBenchmarkQuery.create(reader, query, topK); TopDocs topHits = parentJoinQuery.runExactSearch(); - result[queryOrd] = new int[topK]; - int k = 0; - for (ScoreDoc scoreDoc : topHits.scoreDocs) { - result[queryOrd][k++] = scoreDoc.doc; - } + StoredFields storedFields = reader.storedFields(); + result[queryOrd] = KnnTesterUtils.getResultIds(topHits, storedFields); } catch (IOException e) { throw new RuntimeException(e); } } else { + // TODO: Use exactSearch here? NeighborQueue queue = new NeighborQueue(topK, false); try (FileChannel in = FileChannel.open(docPath)) { VectorReader docReader = (VectorReader) VectorReader.create(in, dim, VectorEncoding.FLOAT32); @@ -962,9 +932,6 @@ public Void call() { throw new RuntimeException(e); } } - if ((queryOrd + 1) % 10 == 0) { - log("(parentJoin=%s) top-%d results for iteration %d: %s", parentJoin, topK, queryOrd, Arrays.toString(result[queryOrd])); - } return null; } } diff --git a/src/main/knn/KnnIndexer.java b/src/main/knn/KnnIndexer.java index 61f768880..336b72d15 100644 --- a/src/main/knn/KnnIndexer.java +++ b/src/main/knn/KnnIndexer.java @@ -17,8 +17,6 @@ package knn; -import knn.KnnGraphTester; -import knn.VectorReader; import org.apache.lucene.codecs.Codec; import org.apache.lucene.document.*; import org.apache.lucene.index.IndexWriter; @@ -26,7 +24,6 @@ import org.apache.lucene.index.VectorEncoding; import org.apache.lucene.index.VectorSimilarityFunction; import org.apache.lucene.store.FSDirectory; -import org.apache.lucene.util.PrintStreamInfoStream; import java.io.BufferedReader; import java.io.IOException; @@ -105,6 +102,7 @@ public int createIndex() throws IOException { seekToStartDoc(in, dim, vectorEncoding, docsStartIndex); } VectorReader vectorReader = VectorReader.create(in, dim, vectorEncoding); + log("parentJoin=%s", parentJoin); if (parentJoin == false) { for (int i = 0; i < numDocs; i++) { Document doc = new Document(); @@ -164,10 +162,6 @@ public int createIndex() throws IOException { block.add(parent); iw.addDocuments(block); parentDocs++; - if (parentDocs % 1000 == 0) { - String subDocs = block.stream().map(d -> d.get(KnnGraphTester.WIKI_PARA_ID_FIELD)).collect(Collectors.joining("-")); - log("parentDocId = %s, numSubDocs = %d, subDocs = %s", currWikiId, block.size() - 1, subDocs); - } // create new block for the next article block = new ArrayList<>(); block.add(doc); @@ -191,7 +185,6 @@ public int createIndex() throws IOException { log("Indexed %d documents with %d parent docs. now flush", childDocs, parentDocs); } } -// iw.flush(); } } long elapsed = System.nanoTime() - start; diff --git a/src/main/knn/KnnTesterUtils.java b/src/main/knn/KnnTesterUtils.java new file mode 100644 index 000000000..36e8dd975 --- /dev/null +++ b/src/main/knn/KnnTesterUtils.java @@ -0,0 +1,30 @@ +package knn; + +import org.apache.lucene.index.StoredFields; +import org.apache.lucene.search.ScoreDoc; +import org.apache.lucene.search.TopDocs; + +import java.io.IOException; + +import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS; + +public class KnnTesterUtils { + + /** Fetches values for the "id" field from search results + */ + public static int[] getResultIds(TopDocs topDocs, StoredFields storedFields) throws IOException { + int[] resultIds = new int[topDocs.scoreDocs.length]; + int i = 0; + for (ScoreDoc doc : topDocs.scoreDocs) { + if (doc.doc != NO_MORE_DOCS) { + // there is a bug somewhere that can result in doc=NO_MORE_DOCS! I think it happens + // in some degenerate case (like input query has NaN in it?) that causes no results to + // be returned from HNSW search? + resultIds[i++] = Integer.parseInt(storedFields.document(doc.doc).get(KnnGraphTester.ID_FIELD)); + } else { + System.out.println("NO_MORE_DOCS!"); + } + } + return resultIds; + } +} diff --git a/src/main/knn/ParentJoinBenchmarkQuery.java b/src/main/knn/ParentJoinBenchmarkQuery.java index 6039810f5..0b2ea975f 100644 --- a/src/main/knn/ParentJoinBenchmarkQuery.java +++ b/src/main/knn/ParentJoinBenchmarkQuery.java @@ -18,13 +18,10 @@ package knn; import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.LeafReader; import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.QueryTimeout; import org.apache.lucene.index.Term; -import org.apache.lucene.search.DocIdSet; import org.apache.lucene.search.DocIdSetIterator; -import org.apache.lucene.search.FilteredDocIdSetIterator; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; @@ -36,7 +33,6 @@ import org.apache.lucene.search.join.CheckJoinIndex; import org.apache.lucene.search.join.DiversifyingChildrenFloatKnnVectorQuery; import org.apache.lucene.search.join.QueryBitSetProducer; -import org.apache.lucene.util.Bits; import java.io.IOException; import java.util.List; From c149e8f6ac9913fbe2bba2fb044c53ae17615d0c Mon Sep 17 00:00:00 2001 From: Vigya Sharma Date: Fri, 6 Sep 2024 14:34:25 -0700 Subject: [PATCH 05/13] restore default configs --- src/main/knn/KnnGraphTester.java | 2 +- src/python/knnPerfTest.py | 24 ++++++++---------------- 2 files changed, 9 insertions(+), 17 deletions(-) diff --git a/src/main/knn/KnnGraphTester.java b/src/main/knn/KnnGraphTester.java index e9aa0baef..36d66280f 100644 --- a/src/main/knn/KnnGraphTester.java +++ b/src/main/knn/KnnGraphTester.java @@ -140,7 +140,7 @@ public class KnnGraphTester { private KnnGraphTester() { // set defaults numDocs = 1000; - numIters = 100; + numIters = 1000; dim = 256; topK = 100; numMergeThread = 1; diff --git a/src/python/knnPerfTest.py b/src/python/knnPerfTest.py index 82a8f3385..440573c31 100644 --- a/src/python/knnPerfTest.py +++ b/src/python/knnPerfTest.py @@ -6,7 +6,6 @@ import constants import re from common import getLuceneDirFromGradleProperties -import localconstants # Measure vector search recall and latency while exploring hyperparameters @@ -28,10 +27,6 @@ # Where the version of Lucene is that will be tested. Now this will be sourced from gradle.properties LUCENE_CHECKOUT = getLuceneDirFromGradleProperties() -# Where the version of Lucene is that will be tested. Expected to be in the base dir above luceneutil. -# LUCENE_CHECKOUT = 'baseline' -# LUCENE_CHECKOUT = 'candidate' -# LUCENE_CHECKOUT = 'trunk' # e.g. to compile KnnIndexer: # @@ -43,7 +38,7 @@ #'ndoc': (10000, 100000, 1000000), #'ndoc': (10000, 100000, 200000, 500000), #'ndoc': (10000, 100000, 200000, 500000), - 'ndoc': (50_000,), + 'ndoc': (250_000,), #'ndoc': (100000,), #'maxConn': (32, 64, 96), #'maxConn': (64, ), @@ -54,7 +49,7 @@ #'fanout': (20, 100, 250) 'fanout': (20,), #'quantize': None, - # 'quantizeBits': (4, 7, 8), + 'quantizeBits': (4, 7, 8), 'numMergeWorker': (12,), 'numMergeThread': (4,), 'encoding': ('float32',), @@ -96,15 +91,12 @@ def run_knn_benchmark(checkout, values): #dim = 256 #doc_vectors = '/d/electronics_asin_emb.bin' #query_vectors = '/d/electronics_query_vectors.bin' - dim = 768 - doc_vectors = f"{localconstants.BASE_DIR}/data/{'cohere-wikipedia'}-docs-{dim}d.vec" - query_vectors = f"{localconstants.BASE_DIR}/data/{'cohere-wikipedia'}-queries-{dim}d.vec" - parentJoin_meta_file = f"{localconstants.BASE_DIR}/data/{'cohere-wikipedia'}-metadata.csv" # Cohere dataset -# dim = 768 -# doc_vectors = '%s/data/cohere-wikipedia-768.vec' % constants.BASE_DIR -# query_vectors = '%s/data/cohere-wikipedia-queries-768.vec' % constants.BASE_DIR + dim = 768 + doc_vectors = f"{constants.BASE_DIR}/data/{'cohere-wikipedia'}-docs-{dim}d.vec" + query_vectors = f"{constants.BASE_DIR}/data/{'cohere-wikipedia'}-queries-{dim}d.vec" + parentJoin_meta_file = f"{constants.BASE_DIR}/data/{'cohere-wikipedia'}-metadata.csv" cp = benchUtil.classPathToString(benchUtil.getClassPath(checkout)) cmd = constants.JAVA_EXE.split(' ') + ['-cp', cp, '--add-modules', 'jdk.incubator.vector', @@ -139,10 +131,10 @@ def run_knn_benchmark(checkout, values): '-reindex', '-search', query_vectors, #'-metric', 'euclidean', - '-parentJoin', parentJoin_meta_file, + # '-parentJoin', parentJoin_meta_file, # '-numMergeThread', '8', '-numMergeWorker', '8', # '-forceMerge', - # '-quiet' + '-quiet' ] print(f' cmd: {this_cmd}') job = subprocess.Popen(this_cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, encoding='utf-8') From cfe8125ddf7a50e5c7fb4ee8929b34ae38689330 Mon Sep 17 00:00:00 2001 From: Vigya Sharma Date: Sat, 7 Sep 2024 15:31:45 -0700 Subject: [PATCH 06/13] merge main into pj2 --- src/main/knn/KnnGraphTester.java | 130 ++++++++++++++++--------------- src/main/perf/SearchTask.java | 6 +- src/python/knnPerfTest.py | 19 +++-- 3 files changed, 85 insertions(+), 70 deletions(-) diff --git a/src/main/knn/KnnGraphTester.java b/src/main/knn/KnnGraphTester.java index 36d66280f..51ae69639 100644 --- a/src/main/knn/KnnGraphTester.java +++ b/src/main/knn/KnnGraphTester.java @@ -17,8 +17,6 @@ package knn; -import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS; - import java.io.IOException; import java.io.OutputStream; import java.lang.management.ManagementFactory; @@ -31,9 +29,9 @@ import java.nio.file.Path; import java.nio.file.Paths; import java.nio.file.attribute.FileTime; +import java.util.ArrayDeque; import java.util.ArrayList; import java.util.Arrays; -import java.util.ArrayDeque; import java.util.Deque; import java.util.HashSet; import java.util.List; @@ -42,16 +40,16 @@ import java.util.Set; import java.util.concurrent.Callable; import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; import java.util.concurrent.ForkJoinPool; import java.util.concurrent.TimeUnit; -import java.util.concurrent.Executors; import org.apache.lucene.codecs.Codec; import org.apache.lucene.codecs.KnnVectorsFormat; import org.apache.lucene.codecs.KnnVectorsReader; import org.apache.lucene.codecs.lucene912.Lucene912Codec; -import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat; import org.apache.lucene.codecs.lucene99.Lucene99HnswScalarQuantizedVectorsFormat; +import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsFormat; import org.apache.lucene.codecs.lucene99.Lucene99HnswVectorsReader; import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat; import org.apache.lucene.index.CodecReader; @@ -63,6 +61,7 @@ import org.apache.lucene.index.StoredFields; import org.apache.lucene.index.VectorEncoding; import org.apache.lucene.index.VectorSimilarityFunction; +import org.apache.lucene.internal.hppc.IntIntHashMap; import org.apache.lucene.search.ConstantScoreScorer; import org.apache.lucene.search.ConstantScoreWeight; import org.apache.lucene.search.IndexSearcher; @@ -79,6 +78,7 @@ import org.apache.lucene.search.Weight; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; +import org.apache.lucene.store.MMapDirectory; import org.apache.lucene.util.BitSetIterator; import org.apache.lucene.util.FixedBitSet; import org.apache.lucene.util.NamedThreadFactory; @@ -86,8 +86,9 @@ import org.apache.lucene.util.SuppressForbidden; import org.apache.lucene.util.hnsw.HnswGraph; import org.apache.lucene.util.hnsw.NeighborQueue; + +import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS; //TODO Lucene may make these unavailable, we should pull in this from hppc directly -import org.apache.lucene.internal.hppc.IntIntHashMap; // e.g. to compile with zero build tooling!: // @@ -564,71 +565,71 @@ private void testSearch(Path indexPath, Path queryPath, Path outputPath, int[][] long start; ThreadMXBean bean = ManagementFactory.getThreadMXBean(); long cpuTimeStartNs; - try (Directory dir = FSDirectory.open(indexPath); - DirectoryReader reader = DirectoryReader.open(dir)) { - IndexSearcher searcher = new IndexSearcher(reader); - numDocs = reader.maxDoc(); - Query bitSetQuery = prefilter ? new BitSetQuery(matchDocs) : null; - for (int i = 0; i < numIters; i++) { - // warm up - if (vectorEncoding.equals(VectorEncoding.BYTE)) { - byte[] target = targetReaderByte.nextBytes(); - if (prefilter) { - doKnnByteVectorQuery(searcher, KNN_FIELD, target, topK, fanout, bitSetQuery); - } else { - doKnnByteVectorQuery(searcher, KNN_FIELD, target, (int) (topK / selectivity), fanout, null); - } - } else { - float[] target = targetReader.next(); - if (prefilter) { - doKnnVectorQuery(searcher, KNN_FIELD, target, topK, fanout, bitSetQuery, parentJoin); + try (MMapDirectory dir = new MMapDirectory(indexPath)) { + dir.setPreload((x, ctx) -> x.endsWith(".vec") || x.endsWith(".veq")); + try (DirectoryReader reader = DirectoryReader.open(dir)) { + IndexSearcher searcher = new IndexSearcher(reader); + numDocs = reader.maxDoc(); + Query bitSetQuery = prefilter ? new BitSetQuery(matchDocs) : null; + for (int i = 0; i < numIters; i++) { + // warm up + if (vectorEncoding.equals(VectorEncoding.BYTE)) { + byte[] target = targetReaderByte.nextBytes(); + if (prefilter) { + doKnnByteVectorQuery(searcher, KNN_FIELD, target, topK, fanout, bitSetQuery); + } else { + doKnnByteVectorQuery(searcher, KNN_FIELD, target, (int) (topK / selectivity), fanout, null); + } } else { - doKnnVectorQuery(searcher, KNN_FIELD, target, (int) (topK / selectivity), fanout, null, parentJoin); + float[] target = targetReader.next(); + if (prefilter) { + doKnnVectorQuery(searcher, KNN_FIELD, target, topK, fanout, bitSetQuery, parentJoin); + } else { + doKnnVectorQuery(searcher, KNN_FIELD, target, (int) (topK / selectivity), fanout, null, parentJoin); + } } } - } - targetReader.reset(); - start = System.nanoTime(); - cpuTimeStartNs = bean.getCurrentThreadCpuTime(); - for (int i = 0; i < numIters; i++) { - if (vectorEncoding.equals(VectorEncoding.BYTE)) { - byte[] target = targetReaderByte.nextBytes(); - if (prefilter) { - results[i] = doKnnByteVectorQuery(searcher, KNN_FIELD, target, topK, fanout, bitSetQuery); - } else { - results[i] = doKnnByteVectorQuery(searcher, KNN_FIELD, target, (int) (topK / selectivity), fanout, null); - } - } else { - float[] target = targetReader.next(); - if (prefilter) { - results[i] = doKnnVectorQuery(searcher, KNN_FIELD, target, topK, fanout, bitSetQuery, parentJoin); + targetReader.reset(); + start = System.nanoTime(); + cpuTimeStartNs = bean.getCurrentThreadCpuTime(); + for (int i = 0; i < numIters; i++) { + if (vectorEncoding.equals(VectorEncoding.BYTE)) { + byte[] target = targetReaderByte.nextBytes(); + if (prefilter) { + results[i] = doKnnByteVectorQuery(searcher, KNN_FIELD, target, topK, fanout, bitSetQuery); + } else { + results[i] = doKnnByteVectorQuery(searcher, KNN_FIELD, target, (int) (topK / selectivity), fanout, null); + } } else { - results[i] = - doKnnVectorQuery( - searcher, KNN_FIELD, target, (int) (topK / selectivity), fanout, null, parentJoin); - } - } - if (prefilter == false && matchDocs != null) { - results[i].scoreDocs = - Arrays.stream(results[i].scoreDocs) + float[] target = targetReader.next(); + if (prefilter) { + results[i] = doKnnVectorQuery(searcher, KNN_FIELD, target, topK, fanout, bitSetQuery, parentJoin); + } else { + results[i] = + doKnnVectorQuery( + searcher, KNN_FIELD, target, (int) (topK / selectivity), fanout, null, parentJoin); + } + if (prefilter == false && matchDocs != null) { + results[i].scoreDocs = + Arrays.stream(results[i].scoreDocs) .filter(scoreDoc -> matchDocs.get(scoreDoc.doc)) .toArray(ScoreDoc[]::new); + } + } } - } - totalCpuTimeMS = + totalCpuTimeMS = TimeUnit.NANOSECONDS.toMillis(bean.getCurrentThreadCpuTime() - cpuTimeStartNs); - elapsed = TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - start); // ns -> ms + elapsed = TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - start); // ns -> ms - // Fetch, validate and write result document ids. - StoredFields storedFields = reader.storedFields(); - for (int i = 0; i < numIters; i++) { - totalVisited += results[i].totalHits.value; - resultIds[i] = KnnTesterUtils.getResultIds(results[i], storedFields); - } - } - if (quiet == false) { - System.out.println( - "completed " + // Fetch, validate and write result document ids. + StoredFields storedFields = reader.storedFields(); + for (int i = 0; i < numIters; i++) { + totalVisited += results[i].totalHits.value; + resultIds[i] = KnnTesterUtils.getResultIds(results[i], storedFields); + } + if (quiet == false) { + System.out.println( + "completed " + numIters + " searches in " + elapsed @@ -638,6 +639,8 @@ private void testSearch(Path indexPath, Path queryPath, Path outputPath, int[][] + "CPU time=" + totalCpuTimeMS + "ms"); + } + } } } finally { executorService.shutdown(); @@ -663,10 +666,11 @@ private void testSearch(Path indexPath, Path queryPath, Path outputPath, int[][] } System.out.printf( Locale.ROOT, - "SUMMARY: %5.3f\t%5.2f\t%d\t%d\t%d\t%d\t%s\t%d\t%d\t%.2f\t%s\n", + "SUMMARY: %5.3f\t%5.2f\t%d\t%d\t%d\t%d\t%d\t%s\t%d\t%d\t%.2f\t%s\n", recall, totalCpuTimeMS / (float) numIters, numDocs, + topK, fanout, maxConn, beamWidth, diff --git a/src/main/perf/SearchTask.java b/src/main/perf/SearchTask.java index b448d4996..ee6240c2b 100644 --- a/src/main/perf/SearchTask.java +++ b/src/main/perf/SearchTask.java @@ -22,6 +22,7 @@ import org.apache.lucene.facet.FacetResult; import org.apache.lucene.facet.Facets; import org.apache.lucene.facet.FacetsCollector; +import org.apache.lucene.facet.FacetsCollectorManager; import org.apache.lucene.facet.range.LongRange; import org.apache.lucene.facet.range.LongRangeFacetCounts; import org.apache.lucene.facet.sortedset.SortedSetDocValuesFacetCounts; @@ -241,8 +242,9 @@ public void go(IndexState state, TaskParser taskParser) throws IOException { getFacetResultsMsec = (System.nanoTime() - t0)/1000000.0; } else { facetResults = new ArrayList(); - FacetsCollector fc = new FacetsCollector(); - hits = FacetsCollector.search(searcher, q, 10, fc); + FacetsCollectorManager.FacetsResult fr = FacetsCollectorManager.search(searcher, q, 10, new FacetsCollectorManager()); + hits = fr.topDocs(); + FacetsCollector fc = fr.facetsCollector(); long t0 = System.nanoTime(); for(String request : facetRequests) { if (request.startsWith("range:")) { diff --git a/src/python/knnPerfTest.py b/src/python/knnPerfTest.py index 08a407885..2ea6d79cd 100644 --- a/src/python/knnPerfTest.py +++ b/src/python/knnPerfTest.py @@ -156,27 +156,36 @@ def run_knn_benchmark(checkout, values): all_results.append(summary) print('\nResults:') - header = 'recall\tlatency (ms)\tnDoc\tfanout\tmaxConn\tbeamWidth\tquantized\tvisited\tindex ms\tselectivity\tfilterType' + header = 'recall\tlatency (ms)\tnDoc\ttopK\tfanout\tmaxConn\tbeamWidth\tquantized\tvisited\tindex ms\tselectivity\tfilterType' # crazy logic to make everything fixed width so rendering in fixed width font "aligns": - num_columns = len(header.split('\t')) + headers = header.split('\t') + num_columns = len(headers) # print(f'{num_columns} columns') max_by_col = [0] * num_columns rows_to_print = [header] + all_results + # TODO: be more careful when we skip/show headers e.g. if some of the runs involve filtering, + # turn filterType/selectivity back on for all runs + skip_headers = {'selectivity', 'filterType', 'visited'} + + skip_column_index = {headers.index(h) for h in skip_headers} + for row in rows_to_print: by_column = row.split('\t') if len(by_column) != num_columns: - raise RuntimeError(f'wrong number of columns: expected {num_columns} but got {len(by_column)}') + raise RuntimeError(f'wrong number of columns: expected {num_columns} but got {len(by_column)} in "{row}"') for i, s in enumerate(by_column): max_by_col[i] = max(max_by_col[i], len(s)) - row_fmt = ' '.join([f'%{max_by_col[i]}s' for i in range(num_columns)]) + row_fmt = ' '.join([f'%{max_by_col[i]}s' for i in range(num_columns) if i not in skip_column_index]) # print(f'using row format {row_fmt}') for row in rows_to_print: - print(row_fmt % tuple(row.split('\t'))) + cols = row.split('\t') + cols = tuple(cols[x] for x in range(len(cols)) if x not in skip_column_index) + print(row_fmt % cols) run_knn_benchmark(LUCENE_CHECKOUT, PARAMS) From 55e19c498cb6d6c6a4a6ac6130457894e0e9cc37 Mon Sep 17 00:00:00 2001 From: Vigya Sharma Date: Tue, 10 Sep 2024 11:33:27 -0700 Subject: [PATCH 07/13] merge in changes from main --- src/main/knn/KnnGraphTester.java | 18 +++++++++++++++++- src/main/knn/VectorReader.java | 6 +++++- src/python/knnPerfTest.py | 2 +- 3 files changed, 23 insertions(+), 3 deletions(-) diff --git a/src/main/knn/KnnGraphTester.java b/src/main/knn/KnnGraphTester.java index 51ae69639..4138bc53d 100644 --- a/src/main/knn/KnnGraphTester.java +++ b/src/main/knn/KnnGraphTester.java @@ -54,10 +54,12 @@ import org.apache.lucene.codecs.perfield.PerFieldKnnVectorsFormat; import org.apache.lucene.index.CodecReader; import org.apache.lucene.index.DirectoryReader; +import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.IndexWriterConfig; import org.apache.lucene.index.LeafReader; import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.StandardDirectoryReader; import org.apache.lucene.index.StoredFields; import org.apache.lucene.index.VectorEncoding; import org.apache.lucene.index.VectorSimilarityFunction; @@ -121,6 +123,8 @@ public class KnnGraphTester { private boolean reindex; private boolean forceMerge; private int reindexTimeMsec; + private int indexNumSegments; + private double indexSizeOnDiskMB; private int beamWidth; private int maxConn; private boolean quantize; @@ -386,6 +390,16 @@ private void run(String... args) throws Exception { if (forceMerge) { forceMerge(); } + try (Directory dir = FSDirectory.open(indexPath); IndexReader reader = DirectoryReader.open(dir)) { + indexNumSegments = reader.leaves().size(); + System.out.println("index has " + indexNumSegments + " segments"); + long indexSizeOnDiskBytes = 0; + for(String fileName : ((StandardDirectoryReader) reader).getSegmentInfos().files(true)) { + indexSizeOnDiskBytes += dir.fileLength(fileName); + } + indexSizeOnDiskMB = indexSizeOnDiskBytes / 1024. / 1024.; + System.out.println(String.format(Locale.ROOT, "index disk uage is %.2f MB", indexSizeOnDiskMB)); + } if (operation != null) { switch (operation) { case "-search": @@ -666,7 +680,7 @@ private void testSearch(Path indexPath, Path queryPath, Path outputPath, int[][] } System.out.printf( Locale.ROOT, - "SUMMARY: %5.3f\t%5.2f\t%d\t%d\t%d\t%d\t%d\t%s\t%d\t%d\t%.2f\t%s\n", + "SUMMARY: %5.3f\t%5.3f\t%d\t%d\t%d\t%d\t%d\t%s\t%d\t%d\t%d\t%.2f\t%.2f\t%s\n", recall, totalCpuTimeMS / (float) numIters, numDocs, @@ -677,6 +691,8 @@ private void testSearch(Path indexPath, Path queryPath, Path outputPath, int[][] quantizeDesc, totalVisited, reindexTimeMsec, + indexNumSegments, + indexSizeOnDiskMB, selectivity, prefilter ? "pre-filter" : "post-filter"); } diff --git a/src/main/knn/VectorReader.java b/src/main/knn/VectorReader.java index 8e4d48055..d3495d12c 100644 --- a/src/main/knn/VectorReader.java +++ b/src/main/knn/VectorReader.java @@ -50,8 +50,12 @@ void reset() throws IOException { protected final void readNext() throws IOException { int bytesRead = this.input.read(bytes); if (bytesRead < bytes.capacity()) { + // wrap around back to the start of the file if we hit the end: this.input.position(0); - this.input.read(bytes); + bytesRead = this.input.read(bytes); + if (bytesRead < bytes.capacity()) { + throw new IllegalStateException("vector file " + input + " doesn't even have enough bytes for a single vector? got bytesRead=" + bytesRead); + } } bytes.position(0); } diff --git a/src/python/knnPerfTest.py b/src/python/knnPerfTest.py index 2ea6d79cd..f6bebdd10 100644 --- a/src/python/knnPerfTest.py +++ b/src/python/knnPerfTest.py @@ -156,7 +156,7 @@ def run_knn_benchmark(checkout, values): all_results.append(summary) print('\nResults:') - header = 'recall\tlatency (ms)\tnDoc\ttopK\tfanout\tmaxConn\tbeamWidth\tquantized\tvisited\tindex ms\tselectivity\tfilterType' + header = 'recall\tlatency (ms)\tnDoc\ttopK\tfanout\tmaxConn\tbeamWidth\tquantized\tvisited\tindex ms\tnum segments\tindex size (MB)\tselectivity\tfilterType' # crazy logic to make everything fixed width so rendering in fixed width font "aligns": headers = header.split('\t') From e828a40d29a98d9627a167f00a9acdf55cd246d4 Mon Sep 17 00:00:00 2001 From: Vigya Sharma Date: Tue, 10 Sep 2024 12:15:32 -0700 Subject: [PATCH 08/13] remove indexreader from parentJoin query --- src/main/knn/KnnGraphTester.java | 120 ++++++++++++++------- src/main/knn/KnnTesterUtils.java | 17 +++ src/main/knn/ParentJoinBenchmarkQuery.java | 27 ++--- 3 files changed, 105 insertions(+), 59 deletions(-) diff --git a/src/main/knn/KnnGraphTester.java b/src/main/knn/KnnGraphTester.java index 4138bc53d..30d41e5a2 100644 --- a/src/main/knn/KnnGraphTester.java +++ b/src/main/knn/KnnGraphTester.java @@ -78,6 +78,7 @@ import org.apache.lucene.search.TopDocs; import org.apache.lucene.search.TotalHits; import org.apache.lucene.search.Weight; +import org.apache.lucene.search.join.CheckJoinIndex; import org.apache.lucene.store.Directory; import org.apache.lucene.store.FSDirectory; import org.apache.lucene.store.MMapDirectory; @@ -710,7 +711,7 @@ private static TopDocs doKnnVectorQuery( IndexSearcher searcher, String field, float[] vector, int k, int fanout, Query filter, boolean isParentJoinQuery) throws IOException { if (isParentJoinQuery) { - ParentJoinBenchmarkQuery parentJoinQuery = ParentJoinBenchmarkQuery.create(searcher.getIndexReader(), vector, k); + ParentJoinBenchmarkQuery parentJoinQuery = new ParentJoinBenchmarkQuery(vector, null, k); return searcher.search(parentJoinQuery, k); } ProfiledKnnFloatVectorQuery profiledQuery = new ProfiledKnnFloatVectorQuery(field, vector, k, fanout, filter); @@ -888,15 +889,33 @@ private int[][] computeNN(Path docPath, Path queryPath) throws IOException, InterruptedException { int[][] result = new int[numIters][]; log("computing true nearest neighbors of " + numIters + " target vectors"); - List tasks = new ArrayList<>(); - try (FileChannel qIn = FileChannel.open(queryPath)) { - VectorReader queryReader = (VectorReader) VectorReader.create(qIn, dim, VectorEncoding.FLOAT32); - for (int i = 0; i < numIters; i++) { - float[] query = queryReader.next().clone(); - tasks.add(new ComputeNNFloatTask(i, query, docPath, result)); + log("parentJoin = %s", parentJoin); + if (parentJoin) { + try (Directory dir = FSDirectory.open(indexPath); + DirectoryReader reader = DirectoryReader.open(dir)) { + CheckJoinIndex.check(reader, ParentJoinBenchmarkQuery.parentsFilter); + List tasks = new ArrayList<>(); + try (FileChannel qIn = FileChannel.open(queryPath)) { + VectorReader queryReader = (VectorReader) VectorReader.create(qIn, dim, VectorEncoding.FLOAT32); + for (int i = 0; i < numIters; i++) { + float[] query = queryReader.next().clone(); + tasks.add(new ComputeExactSearchNNFloatTask(i, query, docPath, result, reader)); + } + } + ForkJoinPool.commonPool().invokeAll(tasks); + } + } else { + // TODO: Use exactSearch here? + List tasks = new ArrayList<>(); + try (FileChannel qIn = FileChannel.open(queryPath)) { + VectorReader queryReader = (VectorReader) VectorReader.create(qIn, dim, VectorEncoding.FLOAT32); + for (int i = 0; i < numIters; i++) { + float[] query = queryReader.next().clone(); + tasks.add(new ComputeNNFloatTask(i, query, docPath, result)); + } } + ForkJoinPool.commonPool().invokeAll(tasks); } - ForkJoinPool.commonPool().invokeAll(tasks); return result; } @@ -916,41 +935,60 @@ class ComputeNNFloatTask implements Callable { @Override public Void call() { - if (parentJoin) { - // Use DiversifyingChildrenFloatKnnVectorQuery for parentJoins - try (Directory dir = FSDirectory.open(indexPath); - DirectoryReader reader = DirectoryReader.open(dir)) { - ParentJoinBenchmarkQuery parentJoinQuery = ParentJoinBenchmarkQuery.create(reader, query, topK); - TopDocs topHits = parentJoinQuery.runExactSearch(); - StoredFields storedFields = reader.storedFields(); - result[queryOrd] = KnnTesterUtils.getResultIds(topHits, storedFields); - } catch (IOException e) { - throw new RuntimeException(e); - } - } else { - // TODO: Use exactSearch here? - NeighborQueue queue = new NeighborQueue(topK, false); - try (FileChannel in = FileChannel.open(docPath)) { - VectorReader docReader = (VectorReader) VectorReader.create(in, dim, VectorEncoding.FLOAT32); - for (int j = 0; j < numDocs; j++) { - float[] doc = docReader.next(); - float d = similarityFunction.compare(query, doc); - if (matchDocs == null || matchDocs.get(j)) { - queue.insertWithOverflow(j, d); - } - } - docReader.reset(); - result[queryOrd] = new int[topK]; - for (int k = topK - 1; k >= 0; k--) { - result[queryOrd][k] = queue.topNode(); - queue.pop(); + NeighborQueue queue = new NeighborQueue(topK, false); + try (FileChannel in = FileChannel.open(docPath)) { + VectorReader docReader = (VectorReader) VectorReader.create(in, dim, VectorEncoding.FLOAT32); + for (int j = 0; j < numDocs; j++) { + float[] doc = docReader.next(); + float d = similarityFunction.compare(query, doc); + if (matchDocs == null || matchDocs.get(j)) { + queue.insertWithOverflow(j, d); } - if ((queryOrd + 1) % 10 == 0) { - log(" " + (queryOrd + 1)); - } - } catch (IOException e) { - throw new RuntimeException(e); } + docReader.reset(); + result[queryOrd] = new int[topK]; + for (int k = topK - 1; k >= 0; k--) { + result[queryOrd][k] = queue.topNode(); + queue.pop(); + } + if ((queryOrd + 1) % 10 == 0) { + log(" " + (queryOrd + 1)); + } + } catch (IOException e) { + throw new RuntimeException(e); + } + return null; + } + } + + /** Uses ExactSearch from Lucene queries to compute nearest neighbors. + */ + class ComputeExactSearchNNFloatTask implements Callable { + + private final int queryOrd; + private final float[] query; + private final Path docPath; + private final int[][] result; + private final IndexReader reader; + + ComputeExactSearchNNFloatTask(int queryOrd, float[] query, Path docPath, int[][] result, IndexReader reader) { + this.queryOrd = queryOrd; + this.query = query; + this.docPath = docPath; + this.result = result; + this.reader = reader; + } + + @Override + public Void call() { + // we only use this for ParentJoin benchmarks right now, TODO: extend for all computeNN needs. + try { + ParentJoinBenchmarkQuery parentJoinQuery = new ParentJoinBenchmarkQuery(query, null, topK); + TopDocs topHits = ParentJoinBenchmarkQuery.runExactSearch(reader, parentJoinQuery); + StoredFields storedFields = reader.storedFields(); + result[queryOrd] = KnnTesterUtils.getResultIds(topHits, storedFields); + } catch (IOException e) { + throw new RuntimeException(e); } return null; } diff --git a/src/main/knn/KnnTesterUtils.java b/src/main/knn/KnnTesterUtils.java index 36e8dd975..d43d25acc 100644 --- a/src/main/knn/KnnTesterUtils.java +++ b/src/main/knn/KnnTesterUtils.java @@ -1,3 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + package knn; import org.apache.lucene.index.StoredFields; diff --git a/src/main/knn/ParentJoinBenchmarkQuery.java b/src/main/knn/ParentJoinBenchmarkQuery.java index 0b2ea975f..507663d6a 100644 --- a/src/main/knn/ParentJoinBenchmarkQuery.java +++ b/src/main/knn/ParentJoinBenchmarkQuery.java @@ -30,7 +30,6 @@ import org.apache.lucene.search.TopDocs; import org.apache.lucene.search.Weight; import org.apache.lucene.search.join.BitSetProducer; -import org.apache.lucene.search.join.CheckJoinIndex; import org.apache.lucene.search.join.DiversifyingChildrenFloatKnnVectorQuery; import org.apache.lucene.search.join.QueryBitSetProducer; @@ -41,20 +40,13 @@ public class ParentJoinBenchmarkQuery extends DiversifyingChildrenFloatKnnVectorQuery { - IndexReader reader; - int topK; + public static final BitSetProducer parentsFilter = + new QueryBitSetProducer(new TermQuery(new Term(DOCTYPE_FIELD, DOCTYPE_PARENT))); - static ParentJoinBenchmarkQuery create(IndexReader reader, float[] queryVector, int topK) throws IOException { - BitSetProducer parentsFilter = - new QueryBitSetProducer(new TermQuery(new Term(DOCTYPE_FIELD, DOCTYPE_PARENT))); - CheckJoinIndex.check(reader, parentsFilter); - return new ParentJoinBenchmarkQuery(reader, queryVector, null, topK, parentsFilter); - } + private static final TermQuery childDocQuery = new TermQuery(new Term(DOCTYPE_FIELD, DOCTYPE_CHILD)); - ParentJoinBenchmarkQuery(IndexReader reader, float[] query, Query childFilter, int k, BitSetProducer parentsFilter) throws IOException { - super(KNN_FIELD, query, childFilter, k, parentsFilter); - this.reader = reader; - this.topK = k; + ParentJoinBenchmarkQuery(float[] queryVector, Query childFilter, int k) throws IOException { + super(KNN_FIELD, queryVector, childFilter, k, parentsFilter); } // expose for benchmarking @@ -63,16 +55,15 @@ public TopDocs exactSearch(LeafReaderContext context, DocIdSetIterator acceptIte return super.exactSearch(context, acceptIterator, queryTimeout); } - public TopDocs runExactSearch() throws IOException { + public static TopDocs runExactSearch(IndexReader reader, ParentJoinBenchmarkQuery query) throws IOException { IndexSearcher searcher = new IndexSearcher(reader); List leafReaderContexts = reader.leaves(); TopDocs[] perLeafResults = new TopDocs[leafReaderContexts.size()]; int leaf = 0; for (LeafReaderContext ctx : leafReaderContexts) { - TermQuery children = new TermQuery(new Term(DOCTYPE_FIELD, DOCTYPE_CHILD)); - Weight childrenWeight = children.createWeight(searcher, ScoreMode.COMPLETE_NO_SCORES, 1f); + Weight childrenWeight = childDocQuery.createWeight(searcher, ScoreMode.COMPLETE_NO_SCORES, 1f); DocIdSetIterator acceptDocs = childrenWeight.scorer(ctx).iterator(); - perLeafResults[leaf] = exactSearch(ctx, acceptDocs, null); + perLeafResults[leaf] = query.exactSearch(ctx, acceptDocs, null); if (ctx.docBase > 0) { for (ScoreDoc scoreDoc : perLeafResults[leaf].scoreDocs) { scoreDoc.doc += ctx.docBase; @@ -80,6 +71,6 @@ public TopDocs runExactSearch() throws IOException { } leaf++; } - return super.mergeLeafResults(perLeafResults); + return query.mergeLeafResults(perLeafResults); } } From 399442ac65785a44f0dea42bc20594db540abac9 Mon Sep 17 00:00:00 2001 From: Vigya Sharma Date: Tue, 10 Sep 2024 13:38:15 -0700 Subject: [PATCH 09/13] docstring for ParentJoinBenchmarkQuery --- src/main/knn/ParentJoinBenchmarkQuery.java | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/main/knn/ParentJoinBenchmarkQuery.java b/src/main/knn/ParentJoinBenchmarkQuery.java index 507663d6a..65b1334c7 100644 --- a/src/main/knn/ParentJoinBenchmarkQuery.java +++ b/src/main/knn/ParentJoinBenchmarkQuery.java @@ -38,6 +38,8 @@ import static knn.KnnGraphTester.*; +/** Exposes functions to directly invoke {@link DiversifyingChildrenFloatKnnVectorQuery#exactSearch} + */ public class ParentJoinBenchmarkQuery extends DiversifyingChildrenFloatKnnVectorQuery { public static final BitSetProducer parentsFilter = From f7e69009d08ad28cf616b530139894a22e005da8 Mon Sep 17 00:00:00 2001 From: Vigya Sharma Date: Tue, 10 Sep 2024 13:54:19 -0700 Subject: [PATCH 10/13] fix condition styling --- src/main/knn/KnnGraphTester.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/knn/KnnGraphTester.java b/src/main/knn/KnnGraphTester.java index 30d41e5a2..ed7ade4d4 100644 --- a/src/main/knn/KnnGraphTester.java +++ b/src/main/knn/KnnGraphTester.java @@ -365,7 +365,7 @@ private void run(String... args) throws Exception { indexPath = Paths.get(formatIndexPath(docVectorsPath)); // derive index path log("Index Path = %s", indexPath); } - if (parentJoin && !reindex && !isParentJoinIndex(indexPath)) { + if (parentJoin && reindex == false && isParentJoinIndex(indexPath) == false) { throw new IllegalArgumentException("Provided index: [" + indexPath + "] does not have parent-child " + "document relationships. Rerun with -reindex or without -parentJoin argument"); } From 93fa53b81c441c6285c58d93cb45efd101ae2e93 Mon Sep 17 00:00:00 2001 From: Vigya Sharma Date: Tue, 17 Sep 2024 11:34:30 -0700 Subject: [PATCH 11/13] remove dups from merges --- src/main/knn/KnnGraphTester.java | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/src/main/knn/KnnGraphTester.java b/src/main/knn/KnnGraphTester.java index 767650684..08dcc9f7e 100644 --- a/src/main/knn/KnnGraphTester.java +++ b/src/main/knn/KnnGraphTester.java @@ -404,16 +404,6 @@ private void run(String... args) throws Exception { indexSizeOnDiskMB = indexSizeOnDiskBytes / 1024. / 1024.; System.out.println(String.format(Locale.ROOT, "index disk uage is %.2f MB", indexSizeOnDiskMB)); } - try (Directory dir = FSDirectory.open(indexPath); IndexReader reader = DirectoryReader.open(dir)) { - indexNumSegments = reader.leaves().size(); - System.out.println("index has " + indexNumSegments + " segments"); - long indexSizeOnDiskBytes = 0; - for(String fileName : ((StandardDirectoryReader) reader).getSegmentInfos().files(true)) { - indexSizeOnDiskBytes += dir.fileLength(fileName); - } - indexSizeOnDiskMB = indexSizeOnDiskBytes / 1024. / 1024.; - System.out.println(String.format(Locale.ROOT, "index disk uage is %.2f MB", indexSizeOnDiskMB)); - } if (operation != null) { switch (operation) { case "-search": @@ -734,7 +724,7 @@ private static TopDocs doKnnByteVectorQuery( throws IOException { ProfiledKnnByteVectorQuery profiledQuery = new ProfiledKnnByteVectorQuery(field, vector, k, fanout, filter); TopDocs docs = searcher.search(profiledQuery, k); - return new TopDocs(new TotalHits(profiledQuery.totalVectorCount(), docs.totalHits.relation()), docs.scoreDocs); + return new TopDocs(new TotalHits(profiledQuery.totalVectorCount(), docs.totalHits.relation), docs.scoreDocs); } private static TopDocs doKnnVectorQuery( @@ -746,7 +736,7 @@ private static TopDocs doKnnVectorQuery( } ProfiledKnnFloatVectorQuery profiledQuery = new ProfiledKnnFloatVectorQuery(field, vector, k, fanout, filter); TopDocs docs = searcher.search(profiledQuery, k); - return new TopDocs(new TotalHits(profiledQuery.totalVectorCount(), docs.totalHits.relation()), docs.scoreDocs); + return new TopDocs(new TotalHits(profiledQuery.totalVectorCount(), docs.totalHits.relation), docs.scoreDocs); } private float checkResults(int[][] results, int[][] nn) { From 370659263e2360f26c6e303653de33c11b9d7f3c Mon Sep 17 00:00:00 2001 From: Vigya Sharma Date: Tue, 17 Sep 2024 11:45:55 -0700 Subject: [PATCH 12/13] Use TotalHits record --- src/main/knn/KnnGraphTester.java | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/main/knn/KnnGraphTester.java b/src/main/knn/KnnGraphTester.java index 08dcc9f7e..5159c7bff 100644 --- a/src/main/knn/KnnGraphTester.java +++ b/src/main/knn/KnnGraphTester.java @@ -724,7 +724,7 @@ private static TopDocs doKnnByteVectorQuery( throws IOException { ProfiledKnnByteVectorQuery profiledQuery = new ProfiledKnnByteVectorQuery(field, vector, k, fanout, filter); TopDocs docs = searcher.search(profiledQuery, k); - return new TopDocs(new TotalHits(profiledQuery.totalVectorCount(), docs.totalHits.relation), docs.scoreDocs); + return new TopDocs(new TotalHits(profiledQuery.totalVectorCount(), docs.totalHits.relation()), docs.scoreDocs); } private static TopDocs doKnnVectorQuery( @@ -736,7 +736,7 @@ private static TopDocs doKnnVectorQuery( } ProfiledKnnFloatVectorQuery profiledQuery = new ProfiledKnnFloatVectorQuery(field, vector, k, fanout, filter); TopDocs docs = searcher.search(profiledQuery, k); - return new TopDocs(new TotalHits(profiledQuery.totalVectorCount(), docs.totalHits.relation), docs.scoreDocs); + return new TopDocs(new TotalHits(profiledQuery.totalVectorCount(), docs.totalHits.relation()), docs.scoreDocs); } private float checkResults(int[][] results, int[][] nn) { From 8d37090575f3035e1f375d2b2f69765b90016cec Mon Sep 17 00:00:00 2001 From: Vigya Sharma Date: Tue, 17 Sep 2024 11:49:10 -0700 Subject: [PATCH 13/13] update TotalHits access to use record type in java --- src/main/knn/KnnGraphTester.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/main/knn/KnnGraphTester.java b/src/main/knn/KnnGraphTester.java index 5159c7bff..a1e243b9b 100644 --- a/src/main/knn/KnnGraphTester.java +++ b/src/main/knn/KnnGraphTester.java @@ -650,7 +650,7 @@ private void testSearch(Path indexPath, Path queryPath, Path outputPath, int[][] // Fetch, validate and write result document ids. StoredFields storedFields = reader.storedFields(); for (int i = 0; i < numIters; i++) { - totalVisited += results[i].totalHits.value; + totalVisited += results[i].totalHits.value(); resultIds[i] = KnnTesterUtils.getResultIds(results[i], storedFields); } if (quiet == false) {