From e09f004691592ef0b01fae3ac4aca92012bfefe3 Mon Sep 17 00:00:00 2001 From: oshultseva Date: Fri, 26 Apr 2024 15:51:05 +0100 Subject: [PATCH 1/8] vector db test --- java/drivers/driver-hazelcast4plus/pom.xml | 39 +- .../simulator/tests/vector/DatasetReader.java | 145 +++++++ .../tests/vector/NpyDatasetReader.java | 149 +++++++ .../simulator/tests/vector/ScoreMetrics.java | 29 ++ .../vector/VectorCollectionFromUrlTest.java | 151 +++++++ .../VectorCollectionPutDatasetTest.java | 105 +++++ .../vector/VectorCollectionRandomTest.java | 145 +++++++ .../VectorCollectionSearchDatasetTest.java | 143 +++++++ .../vector/readers/HDF5DatasetReader.java | 50 +++ .../readers/NpyArchiveDatasetReader.java | 86 ++++ .../tests/vector/readers/TarExtractor.java | 51 +++ .../simulator/utils/DatasetReaderTest.java | 44 ++ java/pom.xml | 4 +- vector-search-simulator/README.md | 3 + .../vector-test-2/README.md | 36 ++ .../vector-test-2/ansible.cfg | 10 + .../vector-test-2/aws/auto_mount.py | 66 +++ .../vector-test-2/aws/main.tf | 378 ++++++++++++++++++ .../vector-test-2/hazelcast.xml | 40 ++ .../vector-test-2/inventory.yaml | 12 + .../vector-test-2/inventory_plan.yaml | 53 +++ vector-search-simulator/vector-test-2/setup | 14 + .../vector-test-2/teardown | 7 + .../vector-test-2/tests.yaml | 43 ++ .../vectors-datasets-tests-v2.yaml | 52 +++ vector-search-simulator/vector-test/README.md | 36 ++ .../vector-test/aws/auto_mount.py | 66 +++ .../vector-test/aws/main.tf | 378 ++++++++++++++++++ .../vector-test/hazelcast.xml | 40 ++ .../vector-test/inventory.yaml | 12 + .../vector-test/inventory_plan.yaml | 53 +++ vector-search-simulator/vector-test/setup | 14 + vector-search-simulator/vector-test/teardown | 7 + .../vectors-datasets-tests-v2.yaml | 51 +++ .../vector-test/vectors-datasets-tests.yaml | 52 +++ 35 files changed, 2561 insertions(+), 3 deletions(-) create mode 100644 java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/DatasetReader.java create mode 100644 java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/NpyDatasetReader.java create mode 100644 java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/ScoreMetrics.java create mode 100644 java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/VectorCollectionFromUrlTest.java create mode 100644 java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/VectorCollectionPutDatasetTest.java create mode 100644 java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/VectorCollectionRandomTest.java create mode 100644 java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/VectorCollectionSearchDatasetTest.java create mode 100644 java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/readers/HDF5DatasetReader.java create mode 100644 java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/readers/NpyArchiveDatasetReader.java create mode 100644 java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/readers/TarExtractor.java create mode 100644 java/drivers/driver-hazelcast4plus/src/test/java/com/hazelcast/simulator/utils/DatasetReaderTest.java create mode 100644 vector-search-simulator/README.md create mode 100644 vector-search-simulator/vector-test-2/README.md create mode 100644 vector-search-simulator/vector-test-2/ansible.cfg create mode 100755 
vector-search-simulator/vector-test-2/aws/auto_mount.py
 create mode 100644 vector-search-simulator/vector-test-2/aws/main.tf
 create mode 100644 vector-search-simulator/vector-test-2/hazelcast.xml
 create mode 100644 vector-search-simulator/vector-test-2/inventory.yaml
 create mode 100644 vector-search-simulator/vector-test-2/inventory_plan.yaml
 create mode 100755 vector-search-simulator/vector-test-2/setup
 create mode 100755 vector-search-simulator/vector-test-2/teardown
 create mode 100644 vector-search-simulator/vector-test-2/tests.yaml
 create mode 100644 vector-search-simulator/vector-test-2/vectors-datasets-tests-v2.yaml
 create mode 100644 vector-search-simulator/vector-test/README.md
 create mode 100755 vector-search-simulator/vector-test/aws/auto_mount.py
 create mode 100644 vector-search-simulator/vector-test/aws/main.tf
 create mode 100644 vector-search-simulator/vector-test/hazelcast.xml
 create mode 100644 vector-search-simulator/vector-test/inventory.yaml
 create mode 100644 vector-search-simulator/vector-test/inventory_plan.yaml
 create mode 100755 vector-search-simulator/vector-test/setup
 create mode 100755 vector-search-simulator/vector-test/teardown
 create mode 100644 vector-search-simulator/vector-test/vectors-datasets-tests-v2.yaml
 create mode 100644 vector-search-simulator/vector-test/vectors-datasets-tests.yaml

diff --git a/java/drivers/driver-hazelcast4plus/pom.xml b/java/drivers/driver-hazelcast4plus/pom.xml
index a355b6d63b..f5fd1fe80c 100644
--- a/java/drivers/driver-hazelcast4plus/pom.xml
+++ b/java/drivers/driver-hazelcast4plus/pom.xml
@@ -13,7 +13,7 @@
-        <hazelcast.version>5.4.0-SNAPSHOT</hazelcast.version>
+        <hazelcast.version>5.5.0-SNAPSHOT</hazelcast.version>
         <main.basedir>${project.parent.basedir}</main.basedir>
         <netty.version>4.1.94.Final</netty.version>
         <netty-tcnative.version>2.0.34.Final</netty-tcnative.version>
@@ -71,6 +71,12 @@
             <version>${hazelcast.version}</version>
         </dependency>
+
+        <dependency>
+            <groupId>com.hazelcast</groupId>
+            <artifactId>hazelcast-enterprise-vector</artifactId>
+            <version>${hazelcast.version}</version>
+        </dependency>

         <dependency>
             <groupId>javax.cache</groupId>
             <artifactId>cache-api</artifactId>
@@ -136,5 +142,36 @@
             <version>${netty.version}</version>
         </dependency>
+
+        <dependency>
+            <groupId>org.apache.commons</groupId>
+            <artifactId>commons-lang3</artifactId>
+            <version>3.14.0</version>
+        </dependency>
+
+        <dependency>
+            <groupId>org.apache.httpcomponents</groupId>
+            <artifactId>httpclient</artifactId>
+            <version>${httpclient.version}</version>
+        </dependency>
+
+        <dependency>
+            <groupId>ai.djl</groupId>
+            <artifactId>api</artifactId>
+            <version>0.27.0</version>
+        </dependency>
+
+        <dependency>
+            <groupId>org.jetbrains.bio</groupId>
+            <artifactId>npy</artifactId>
+            <version>0.3.5</version>
+        </dependency>
+
+        <dependency>
+            <groupId>io.jhdf</groupId>
+            <artifactId>jhdf</artifactId>
+            <version>0.6.10</version>
+        </dependency>
     </dependencies>
 </project>
diff --git a/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/DatasetReader.java b/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/DatasetReader.java
new file mode 100644
index 0000000000..770aa13ef9
--- /dev/null
+++ b/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/DatasetReader.java
@@ -0,0 +1,145 @@
+package com.hazelcast.simulator.tests.vector;
+
+import com.hazelcast.simulator.tests.vector.readers.HDF5DatasetReader;
+import com.hazelcast.simulator.tests.vector.readers.NpyArchiveDatasetReader;
+import org.apache.commons.io.FileUtils;
+import org.apache.commons.io.FilenameUtils;
+import org.apache.commons.io.IOUtils;
+import org.apache.http.HttpResponse;
+import org.apache.http.client.ResponseHandler;
+import org.apache.http.client.methods.HttpGet;
+import org.apache.http.impl.client.CloseableHttpClient;
+import org.apache.http.impl.client.HttpClients;
+import org.apache.http.impl.client.LaxRedirectStrategy;
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+
+import java.io.File;
+import java.io.IOException;
+import java.io.InputStream;
+import java.net.MalformedURLException;
+import java.net.URI;
+import java.net.URL;
+import java.nio.file.Path;
+
+public abstract class DatasetReader {
+
+    private final URL datasetURL;
+
+    protected final Path workingDirectory;
+
+    protected final File downloadedFile;
+
+    protected float[][] trainDataset;
+
+    protected float[][] testDataset;
+
+    protected int dimension;
+
+    protected int size;
+
+    protected final Logger logger = LogManager.getLogger(getClass());
+
+    public DatasetReader(String url, String directory) {
+        try {
+            this.datasetURL = URI.create(url).toURL();
+            this.workingDirectory = Path.of(directory, FilenameUtils.getBaseName(datasetURL.getFile()));
+            this.downloadedFile = Path.of(workingDirectory.toString(), FilenameUtils.getName(datasetURL.getFile())).toFile();
+
+            if (!downloadedFile.exists()) {
+                logger.info("Start downloading file from " + datasetURL);
+                download();
+            }
+            logger.info("File downloaded to " + downloadedFile + ". Start unpacking...");
+
+            preprocessDatasetFile();
+            parseTrainDataset();
+            parseTestDataset();
+            logger.info("Dataset reader is initialized");
+        } catch (IOException e) {
+            throw new RuntimeException(e);
+        }
+    }
+
+    protected abstract void preprocessDatasetFile();
+
+    protected abstract void parseTrainDataset();
+
+    protected abstract void parseTestDataset();
+
+    private void cleanup() {
+        try {
+            FileUtils.cleanDirectory(workingDirectory.toFile());
+        } catch (IOException e) {
+            throw new RuntimeException(e);
+        }
+    }
+
+    public float[] getTrainVector(int index) {
+        return trainDataset[index];
+    }
+
+    public float[][] getTestDataset() {
+        return testDataset;
+    }
+
+    public int getDimension() {
+        return dimension;
+    }
+
+    public int getSize() {
+        return size;
+    }
+
+    public int getTestDatasetDimension() {
+        if (testDataset.length == 0) {
+            return 0;
+        }
+        return testDataset[0].length;
+    }
+
+    private void download() {
+        CloseableHttpClient httpClient = HttpClients.custom()
+                .setRedirectStrategy(new LaxRedirectStrategy())
+                .build();
+        try {
+            HttpGet get = new HttpGet(datasetURL.toURI());
+            httpClient.execute(get, new FileDownloadResponseHandler(downloadedFile));
+        } catch (Exception e) {
+            throw new IllegalStateException(e);
+        } finally {
+            IOUtils.closeQuietly(httpClient);
+        }
+    }
+
+
+    private static class FileDownloadResponseHandler implements ResponseHandler<Void> {
+
+        private final File target;
+
+        public FileDownloadResponseHandler(File target) {
+            this.target = target;
+        }
+
+        @Override
+        public Void handleResponse(HttpResponse response) throws IOException {
+            InputStream source = response.getEntity().getContent();
+            FileUtils.copyInputStreamToFile(source, this.target);
+            return null;
+        }
+    }
+
+    public static DatasetReader create(String url, String directory) {
+        try {
+            URL datasetUrl = URI.create(url).toURL();
+            var ext = FilenameUtils.getExtension(datasetUrl.getFile());
+            return switch (ext) {
+                case "hdf5" -> new HDF5DatasetReader(url, directory);
+                case "tgz" -> new NpyArchiveDatasetReader(url, directory);
+                default -> throw new UnsupportedOperationException("File extension '" + ext + "' is not supported");
+            };
+        } catch (MalformedURLException e) {
+            throw new RuntimeException(e);
+        }
+
+    }
+}
diff --git a/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/NpyDatasetReader.java b/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/NpyDatasetReader.java
new file mode 100644
index 0000000000..9c908abedc
--- /dev/null
+++ b/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/NpyDatasetReader.java
@@ -0,0 +1,149 @@
+package com.hazelcast.simulator.tests.vector;
+
+import com.google.gson.JsonArray;
+import com.google.gson.JsonParser;
+import com.hazelcast.simulator.tests.vector.readers.TarExtractor;
+import org.apache.commons.io.FileUtils;
+import org.apache.commons.io.FilenameUtils;
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+import org.jetbrains.bio.npy.NpyArray;
+import org.jetbrains.bio.npy.NpyFile;
+
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.net.URI;
+import java.net.URL;
+import java.nio.charset.Charset;
+import java.nio.file.Path;
+import java.util.Arrays;
+import java.util.List;
+
+@Deprecated
+public class NpyDatasetReader {
+
+    private final URL datasetURL;
+
+    private final Path workingDirectory;
+
+    private final File downloadedFile;
+
+    private final Path vectorsNameFile;
+    private final Path testNameFile;
+
+    private float[] vectorsPlain;
+
+    private List<float[]> query;
+
+    private int dimension;
+    private int size;
+
+    protected final Logger logger = LogManager.getLogger(getClass());
+
+    public NpyDatasetReader(String url, String directory) {
+        try {
+            this.datasetURL = URI.create(url).toURL();
+            this.workingDirectory = Path.of(directory, FilenameUtils.getBaseName(datasetURL.getFile()));
+            this.downloadedFile = Path.of(workingDirectory.toString(), FilenameUtils.getName(datasetURL.getFile())).toFile();
+            this.vectorsNameFile = Path.of(workingDirectory.toString(), "vectors.npy");
+            this.testNameFile = Path.of(workingDirectory.toString(), "tests.jsonl");
+
+            if (!downloadedFile.exists()) {
+                logger.info("Start downloading file from " + datasetURL);
+                download();
+            }
+            logger.info("File downloaded to " + downloadedFile + ". Start unpacking...");
+
+            if (!vectorsNameFile.toFile().exists()) {
+                unpack();
+            }
+            logger.info("Unpacking finished. Start parsing vectors...");
+
+            parseArray();
+            parseTestCases();
+            logger.info("Dataset reader is initialized");
+        } catch (IOException e) {
+            throw new RuntimeException(e);
+        }
+    }
+
+    private void download() {
+        try {
+            FileUtils.copyURLToFile(
+                    datasetURL,
+                    downloadedFile,
+                    120_000,
+                    60_000 * 60);
+        } catch (IOException e) {
+            throw new RuntimeException(e);
+        }
+    }
+
+    private void unpack() throws IOException {
+        TarExtractor.extractTarGZ(new FileInputStream(downloadedFile), workingDirectory);
+    }
+
+    private void parseArray() {
+        NpyArray read = NpyFile.read(vectorsNameFile, Integer.MAX_VALUE);
+        var shape = read.getShape();
+        size = shape[0];
+        dimension = shape[1];
+        vectorsPlain = read.asFloatArray();
+    }
+
+    private void parseTestCases() {
+        try {
+            var parser = new JsonParser();
+            List<String> queryList = FileUtils.readLines(testNameFile.toFile(), Charset.defaultCharset());
+            query = queryList.stream()
+                    .map(line -> parser.parse(line).getAsJsonObject().getAsJsonArray("query"))
+                    .map(this::convert)
+                    .toList();
+        } catch (IOException e) {
+            throw new RuntimeException(e);
+        }
+    }
+
+    private float[] convert(JsonArray array) {
+        var result = new float[array.size()];
+        for (int i = 0; i < array.size(); i++) {
+            result[i] = array.get(i).getAsFloat();
+        }
+        return result;
+    }
+
+    private void cleanup() {
+        try {
+            FileUtils.cleanDirectory(workingDirectory.toFile());
+        } catch (IOException e) {
+            throw new RuntimeException(e);
+        }
+    }
+
+    public float[] read(int index) {
+        if (index >= size) {
+            throw new RuntimeException("invalid index");
+        }
+        return Arrays.copyOfRange(vectorsPlain, index * dimension, (index + 1) * dimension);
+    }
+
+    public List<float[]> getTestCases() {
+        return query;
+    }
+
+    public int getDimension() {
+        return dimension;
+    }
+
+    public int getSize() {
+        return size;
+    }
+
+    public int getQueryDimension() {
+        if (query.isEmpty()) {
+            return 0;
+        }
+        return query.get(0).length;
+    }
+}
diff --git a/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/ScoreMetrics.java b/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/ScoreMetrics.java
new file mode 100644
index 0000000000..7919e99d36
--- /dev/null
+++ b/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/ScoreMetrics.java
@@ -0,0 +1,29 @@
+package com.hazelcast.simulator.tests.vector;
+
+import org.HdrHistogram.Histogram;
+
+public class ScoreMetrics {
+
+    // scores are recorded as integer percentages in [0, 100]: callers pass (int) (score * 100)
+    private static final Histogram scoreHistogram = new Histogram(100, 3);
+
+    public static void set(int score) {
+        scoreHistogram.recordValue(score);
+    }
+
+    public static long getMin() {
+        return scoreHistogram.getMinValue();
+    }
+
+    public static long getMax() {
+        return scoreHistogram.getMaxValue();
+    }
+
+    public static double getMean() {
+        return scoreHistogram.getMean();
+    }
+
+    public static long getPercentLowerThan(int value) {
+        var lower = scoreHistogram.getCountBetweenValues(0, value);
+        return (lower * 100) / scoreHistogram.getTotalCount();
+    }
+}
diff --git a/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/VectorCollectionFromUrlTest.java b/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/VectorCollectionFromUrlTest.java
new file mode 100644
index 0000000000..ac73a73e44
--- /dev/null
+++ b/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/VectorCollectionFromUrlTest.java
@@ -0,0 +1,151 @@
+package com.hazelcast.simulator.tests.vector;
+
+import com.hazelcast.config.vector.Metric;
+import com.hazelcast.config.vector.VectorCollectionConfig;
+import com.hazelcast.config.vector.VectorIndexConfig;
+import com.hazelcast.simulator.hz.HazelcastTest;
+import com.hazelcast.simulator.test.BaseThreadState;
+import com.hazelcast.simulator.test.annotations.AfterRun;
+import com.hazelcast.simulator.test.annotations.Setup;
+import com.hazelcast.simulator.test.annotations.TimeStep;
+import com.hazelcast.vector.SearchOptions;
+import com.hazelcast.vector.SearchOptionsBuilder;
+import com.hazelcast.vector.VectorCollection;
+import com.hazelcast.vector.VectorDocument;
+import com.hazelcast.vector.VectorValues;
+
+import java.util.List;
+import java.util.concurrent.CompletableFuture;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import static org.junit.Assert.assertEquals;
+
+@Deprecated
+public class VectorCollectionFromUrlTest extends HazelcastTest {
+
+    public String datasetUrl;
+
+    public String workingDirectory;
+
+    // common parameters
+    public int numberOfSearchIterations = Integer.MAX_VALUE;
+
+    public int loadFirst = Integer.MAX_VALUE;
+
+    // graph parameters
+    public String metric = "COSINE";
+
+    public int maxDegree = 40;
+
+    public int efConstruction = 50;
+
+    // search parameters
+
+    public int limit = 1;
+
+    // inner test parameters
+
+    private static final String collectionName = "performance-collection";
+
+    private static final int PUT_BATCH_SIZE = 10_000;
+    private VectorCollection<Integer, Integer> collection;
+
+    private final AtomicInteger counter = new AtomicInteger(0);
+    private List<float[]> query;
+
+    @Setup
+    public void setup() {
+        NpyDatasetReader reader = new NpyDatasetReader(datasetUrl, workingDirectory);
+        var size = Math.min(reader.getSize(), loadFirst);
+        int dimension = reader.getDimension();
+        assert dimension == reader.getQueryDimension() : "dataset dimension does not correspond to query vector dimension";
+        query = reader.getTestCases();
+        numberOfSearchIterations = Math.min(numberOfSearchIterations, query.size());
+
+        collection = VectorCollection.getCollection(
+                targetInstance,
+                new VectorCollectionConfig(collectionName)
+                        .addVectorIndexConfig(
+                                new VectorIndexConfig()
+                                        .setMetric(Metric.valueOf(metric))
+                                        .setDimension(dimension)
+                                        .setMaxDegree(maxDegree)
+                                        .setEfConstruction(efConstruction)
+                        )
+        );
+
+        var start = System.currentTimeMillis();
+        for (int i = 0; i < size; i += PUT_BATCH_SIZE) {
+            logger.info(
+                    String.format(
+                            "Uploaded %s vectors from %s. Time (m): %s",
+                            i,
+                            size,
+                            TimeUnit.MILLISECONDS.toMinutes(System.currentTimeMillis() - start)
+                    )
+            );
+            int currentBatchLength = Math.min(PUT_BATCH_SIZE, size - i);
+            int[] keys = new int[currentBatchLength];
+            float[][] vectors = new float[currentBatchLength][];
+            for (int j = 0; j < currentBatchLength; j++) {
+                keys[j] = i + j;
+                vectors[j] = reader.read(i + j);
+            }
+            putBatchSync(keys, vectors);
+        }
+        logger.info("Collection size: " + size);
+        logger.info("Collection dimension: " + reader.getDimension());
+        logger.info("Index build time(m): " + TimeUnit.MILLISECONDS.toMinutes(System.currentTimeMillis() - start));
+    }
+
+    @TimeStep(prob = 0)
+    public void search(ThreadState state) {
+        var iteration = counter.incrementAndGet();
+        if (iteration >= numberOfSearchIterations) {
+            testContext.stop();
+            return;
+        }
+        var vector = query.get(iteration);
+        SearchOptions options = new SearchOptionsBuilder().vector(vector).includePayload().limit(limit).build();
+        var result = collection.searchAsync(options).toCompletableFuture().join();
+        assertEquals(limit, result.size());
+        var score = result.results().next().getScore();
+        ScoreMetrics.set((int) (score * 100));
+        logger.info("Found score: " + score);
+    }
+
+    @AfterRun
+    public void afterRun() {
+        logger.info("Number of search iterations: " + counter.get());
+        logger.info("Min score: " + ScoreMetrics.getMin());
+        logger.info("Max score: " + ScoreMetrics.getMax());
+        logger.info("Mean score: " + ScoreMetrics.getMean());
+        logger.info("Percent lower than 0.98: " + ScoreMetrics.getPercentLowerThan(98));
+    }
+
+    private void putSync(int key, float[] vector) {
+        collection.putAsync(
+                        key,
+                        VectorDocument.of(key, VectorValues.of(vector))
+                )
+                .toCompletableFuture()
+                .join();
+    }
+
+    private void putBatchSync(int[] keys, float[][] vectors) {
+        CompletableFuture<?>[] futures = new CompletableFuture[keys.length];
+        for (int i = 0; i < keys.length; i++) {
+            futures[i] = collection.putAsync(
+                            keys[i],
+                            VectorDocument.of(keys[i], VectorValues.of(vectors[i]))
+                    )
+                    .toCompletableFuture();
+        }
+        CompletableFuture.allOf(futures).join();
+    }
+
+
+    public static class ThreadState extends BaseThreadState {
+    }
+}
diff --git a/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/VectorCollectionPutDatasetTest.java b/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/VectorCollectionPutDatasetTest.java
new file mode 100644
index 0000000000..d2c64a4b33
--- /dev/null
+++ b/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/VectorCollectionPutDatasetTest.java
@@ -0,0 +1,105 @@
+package com.hazelcast.simulator.tests.vector;
+
+import com.hazelcast.config.vector.Metric;
+import com.hazelcast.config.vector.VectorCollectionConfig;
+import com.hazelcast.config.vector.VectorIndexConfig;
+import com.hazelcast.simulator.hz.HazelcastTest;
+import com.hazelcast.simulator.test.BaseThreadState;
+import com.hazelcast.simulator.test.annotations.AfterRun;
+import com.hazelcast.simulator.test.annotations.Setup;
+import com.hazelcast.simulator.test.annotations.TimeStep;
+import com.hazelcast.vector.VectorCollection;
+import com.hazelcast.vector.VectorDocument;
+import com.hazelcast.vector.VectorValues;
+
+import java.util.HashMap;
+import java.util.Map;
+import java.util.concurrent.atomic.AtomicInteger;
+
+public class VectorCollectionPutDatasetTest extends HazelcastTest {
+
+    public String datasetUrl;
+
+    public String workingDirectory;
+
+    // common parameters
+    public int loadFirst = Integer.MAX_VALUE;
+
+    public int putBatchSize = 10_000;
+
+    // graph parameters
+    public String metric = "COSINE";
+
+    public int maxDegree = 40;
+
+    public int efConstruction = 50;
+
+    // inner test parameters
+
+    private static final String collectionName = "performance-collection";
+    private VectorCollection<Integer, Integer> collection;
+
+    private final AtomicInteger counter = new AtomicInteger(0);
+
+    private DatasetReader reader;
+
+    @Setup
+    public void setup() {
+        reader = DatasetReader.create(datasetUrl, workingDirectory);
+        int dimension = reader.getDimension();
+        loadFirst = Math.min(loadFirst, reader.getSize());
+
+        collection = VectorCollection.getCollection(
+                targetInstance,
+                new VectorCollectionConfig(collectionName)
+                        .addVectorIndexConfig(
+                                new VectorIndexConfig()
+                                        .setMetric(Metric.valueOf(metric))
+                                        .setDimension(dimension)
+                                        .setMaxDegree(maxDegree)
+                                        .setEfConstruction(efConstruction)
+                        )
+        );
+    }
+
+    @TimeStep(prob = 0)
+    public void put(ThreadState state) {
+        var iteration = counter.incrementAndGet();
+        if (iteration >= loadFirst) {
+            testContext.stop();
+            return;
+        }
+        var vector = reader.getTrainVector(iteration);
+        collection.putAsync(
+                        iteration,
+                        VectorDocument.of(iteration, VectorValues.of(vector))
+                )
+                .toCompletableFuture()
+                .join();
+    }
+
+    @TimeStep(prob = 0)
+    public void putAll(ThreadState state) {
+        var iteration = counter.getAndAdd(putBatchSize);
+        if (iteration >= loadFirst) {
+            testContext.stop();
+            return;
+        }
+        Map<Integer, VectorDocument<Integer>> buffer = new HashMap<>();
+        for (int i = 0; i < putBatchSize; i++) {
+            var key = iteration + i;
+            if (key >= loadFirst) {
+                // do not read past the configured number of train vectors
+                break;
+            }
+            var vector = reader.getTrainVector(key);
+            buffer.put(key, VectorDocument.of(key, VectorValues.of(vector)));
+        }
+        collection.putAllAsync(buffer)
+                .toCompletableFuture()
+                .join();
+    }
+
+    @AfterRun
+    public void afterRun() {
+    }
+
+    public static class ThreadState extends BaseThreadState {
+    }
+}
diff --git a/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/VectorCollectionRandomTest.java b/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/VectorCollectionRandomTest.java
new file mode 100644
index 0000000000..ece9ea1731
--- /dev/null
+++ b/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/VectorCollectionRandomTest.java
@@ -0,0 +1,145 @@
+package com.hazelcast.simulator.tests.vector;
+
+import com.hazelcast.config.vector.Metric;
+import com.hazelcast.config.vector.VectorCollectionConfig;
+import com.hazelcast.config.vector.VectorIndexConfig;
+import com.hazelcast.simulator.hz.HazelcastTest;
+import com.hazelcast.simulator.test.BaseThreadState;
+import com.hazelcast.simulator.test.annotations.Setup;
+import com.hazelcast.simulator.test.annotations.TimeStep;
+import com.hazelcast.simulator.test.annotations.Verify;
+import com.hazelcast.vector.SearchOptions;
+import com.hazelcast.vector.SearchOptionsBuilder;
+import com.hazelcast.vector.VectorCollection;
+import com.hazelcast.vector.VectorDocument;
+import com.hazelcast.vector.VectorValues;
+import com.hazelcast.vector.impl.SingleIndexVectorValues;
+
+import java.util.Arrays;
+import java.util.Random;
+import java.util.concurrent.atomic.AtomicInteger;
+import java.util.function.Supplier;
+
+import static java.util.stream.IntStream.range;
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+
+@Deprecated
+public class VectorCollectionRandomTest extends HazelcastTest {
+    // common parameters
+    public int numberOfIterations = 0;
+    public int initialCollectionSize = 0;
+
+    public int minCoordinate = 0;
+
+    public int maxCoordinate = 100;
+
+    public boolean generateOnlyIntCoordinate = false;
+
+    // graph parameters
+    public int dimension = 10;
+
+    public String metric = "EUCLIDEAN";
+
+    public int maxDegree = 40;
+
+    public int efConstruction = 50;
+
+    // search parameters
+
+    public int limit = 100;
+
+
+    // inner test parameters
+
+    private final String collectionName = "performance-collection";
+    private VectorCollection<Integer, Integer> collection;
+
+    private final AtomicInteger counter = new AtomicInteger(initialCollectionSize);
+
+    private final Random RANDOM = new Random();
+    private long start;
+
+    @Setup
+    public void setup() {
+        collection = VectorCollection.getCollection(
+                targetInstance,
+                new VectorCollectionConfig(collectionName)
+                        .addVectorIndexConfig(
+                                new VectorIndexConfig()
+                                        .setMetric(Metric.valueOf(metric))
+                                        .setDimension(dimension)
+                                        .setMaxDegree(maxDegree)
+                                        .setEfConstruction(efConstruction)
+                        )
+        );
+
+        for (int i = 0; i < initialCollectionSize; i++) {
+            generateVectorAndPutSync(i);
+        }
+        start = System.currentTimeMillis();
+        logger.info("Generated collection size: " + initialCollectionSize);
+        logger.info("Start time: " + start);
+
+    }
+
+    @TimeStep
+    public void put(ThreadState state) {
+        var key = counter.incrementAndGet();
+        if (numberOfIterations > 0 && key >= numberOfIterations - initialCollectionSize) {
+            testContext.stop();
+            return;
+        }
+        generateVectorAndPutSync(key);
+    }
+
+    @TimeStep(prob = 0)
+    public void search(ThreadState state) {
+        var iteration = counter.incrementAndGet();
+        if (numberOfIterations > 0 && iteration >= numberOfIterations) {
+            testContext.stop();
+            return;
+        }
+        var vector = generateVector(dimension, minCoordinate, maxCoordinate);
+        SearchOptions options = new SearchOptionsBuilder().vector(vector).includePayload().limit(limit).build();
+        var result = collection.searchAsync(options).toCompletableFuture().join();
+        assertEquals(limit, result.size());
+    }
+
+    private void generateVectorAndPutSync(int key) {
+        collection.putAsync(
+                        key,
+                        VectorDocument.of(key, VectorValues.of(generateVector(dimension, minCoordinate, maxCoordinate)))
+                )
+                .toCompletableFuture()
+                .join();
+    }
+
+
+    @Verify
+    public void verify() {
+        logger.info("======== VERIFYING VECTOR COLLECTION =========");
+        logger.info("Collection size is: " + counter.get());
+        logger.info("Test duration (minutes): " + (System.currentTimeMillis() - start) / 60_000);
+        int verifySize = counter.get() / 100;
+        int[] verifyKeys = RANDOM.ints(verifySize, 0, counter.get()).toArray();
+        for (int key : verifyKeys) {
+            VectorDocument<Integer> value = collection.getAsync(key).toCompletableFuture().join();
+            assertNotNull("No value found for the key: " + key, value);
+            var vector = ((SingleIndexVectorValues) value.getVectors()).vector();
+            logger.info(key + " - " + Arrays.toString(vector));
+        }
+    }
+
+    public static class ThreadState extends BaseThreadState {
+    }
+
+    private float[] generateVector(int length, int minValue, int maxValue) {
+        assert minValue >= 0 : "negative min value is not supported";
+        Supplier<Float> generator = generateOnlyIntCoordinate ? () -> (float) RANDOM.nextInt(minValue, maxValue) : () -> RANDOM.nextFloat(minValue, maxValue);
+        float[] value = new float[length];
+        range(0, length).forEach(i -> value[i] = generator.get());
+        return value;
+    }
+
+}
diff --git a/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/VectorCollectionSearchDatasetTest.java b/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/VectorCollectionSearchDatasetTest.java
new file mode 100644
index 0000000000..8766daaa36
--- /dev/null
+++ b/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/VectorCollectionSearchDatasetTest.java
@@ -0,0 +1,143 @@
+package com.hazelcast.simulator.tests.vector;
+
+import com.hazelcast.config.vector.Metric;
+import com.hazelcast.config.vector.VectorCollectionConfig;
+import com.hazelcast.config.vector.VectorIndexConfig;
+import com.hazelcast.simulator.hz.HazelcastTest;
+import com.hazelcast.simulator.test.BaseThreadState;
+import com.hazelcast.simulator.test.annotations.AfterRun;
+import com.hazelcast.simulator.test.annotations.Setup;
+import com.hazelcast.simulator.test.annotations.TimeStep;
+import com.hazelcast.vector.SearchOptions;
+import com.hazelcast.vector.SearchOptionsBuilder;
+import com.hazelcast.vector.VectorCollection;
+import com.hazelcast.vector.VectorDocument;
+import com.hazelcast.vector.VectorValues;
+
+import java.util.HashMap;
+import java.util.Map;
+import java.util.concurrent.TimeUnit;
+import java.util.concurrent.atomic.AtomicInteger;
+
+import static java.lang.String.format;
+
+public class VectorCollectionSearchDatasetTest extends HazelcastTest {
+
+    public String datasetUrl;
+
+    public String workingDirectory;
+
+    // common parameters
+    public int numberOfSearchIterations = Integer.MAX_VALUE;
+
+    public int loadFirst = Integer.MAX_VALUE;
+
+    // graph parameters
+    public String metric = "COSINE";
+
+    public int maxDegree = 40;
+
+    public int efConstruction = 50;
+
+    // search parameters
+
+    public int limit = 1;
+
+    // inner test parameters
+
+    private static final String collectionName = "performance-collection";
+
+    private static final int PUT_BATCH_SIZE = 10_000;
+    private VectorCollection<Integer, Integer> collection;
+
+    private final AtomicInteger searchCounter = new AtomicInteger(0);
+
+    private final AtomicInteger putCounter = new AtomicInteger(0);
+    private float[][] testDataset;
+
+    @Setup
+    public void setup() {
+        DatasetReader reader = DatasetReader.create(datasetUrl, workingDirectory);
+        var size = Math.min(reader.getSize(), loadFirst);
+        int dimension = reader.getDimension();
+        assert dimension == reader.getTestDatasetDimension() : "dataset dimension does not correspond to query vector dimension";
+        testDataset = reader.getTestDataset();
+        numberOfSearchIterations = Math.min(numberOfSearchIterations, testDataset.length);
+
+        collection = VectorCollection.getCollection(
+                targetInstance,
+                new VectorCollectionConfig(collectionName)
+                        .addVectorIndexConfig(
+                                new VectorIndexConfig()
+                                        .setMetric(Metric.valueOf(metric))
+                                        .setDimension(dimension)
+                                        .setMaxDegree(maxDegree)
+                                        .setEfConstruction(efConstruction)
+                        )
+        );
+
+        var start = System.currentTimeMillis();
+
+        Map<Integer, VectorDocument<Integer>> buffer = new HashMap<>();
+        int index;
+        logger.info("Start loading data...");
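+        // Drain indices from the shared putCounter and buffer the corresponding train vectors,
+        // flushing with putAllAsync every PUT_BATCH_SIZE entries so the data load proceeds in
+        // bulk batches instead of one network round trip per vector.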
+        while ((index = putCounter.getAndIncrement()) < size) {
+            buffer.put(index, VectorDocument.of(index, VectorValues.of(reader.getTrainVector(index))));
+            if (buffer.size() == PUT_BATCH_SIZE) {
+                var blockStart = System.currentTimeMillis();
+                collection.putAllAsync(buffer).toCompletableFuture().join();
+                logger.info(
+                        format(
+                                "Uploaded %s vectors from %s. Block size: %s. Delta (m): %s. Total time (m): %s",
+                                index,
+                                size,
+                                buffer.size(),
+                                TimeUnit.MILLISECONDS.toMinutes(System.currentTimeMillis() - blockStart),
+                                TimeUnit.MILLISECONDS.toMinutes(System.currentTimeMillis() - start)
+                        )
+                );
+                buffer.clear();
+            }
+        }
+        if (!buffer.isEmpty()) {
+            collection.putAllAsync(buffer).toCompletableFuture().join();
+            logger.info(format("Uploaded vectors. Last block size: %s.", buffer.size()));
+            buffer.clear();
+        }
+        var startOptimize = System.currentTimeMillis();
+        collection.optimizeAsync().toCompletableFuture().join();
+
+        logger.info("Collection size: " + size);
+        logger.info("Collection dimension: " + reader.getDimension());
+        logger.info("Optimize time(ms): " + (System.currentTimeMillis() - startOptimize));
+        logger.info("Index build time(m): " + TimeUnit.MILLISECONDS.toMinutes(System.currentTimeMillis() - start));
+    }
+
+    @TimeStep(prob = 1)
+    public void search(ThreadState state) {
+        var iteration = searchCounter.incrementAndGet();
+        if (iteration >= numberOfSearchIterations) {
+            testContext.stop();
+            return;
+        }
+        var vector = testDataset[iteration];
+        SearchOptions options = new SearchOptionsBuilder().vector(vector).includePayload().limit(limit).build();
+        var result = collection.searchAsync(options).toCompletableFuture().join();
+
+        var score = result.results().next().getScore();
+        ScoreMetrics.set((int) (score * 100));
+        logger.info("Found score: " + score);
+    }
+
+    @AfterRun
+    public void afterRun() {
+        logger.info("Number of search iterations: " + searchCounter.get());
+        logger.info("Min score: " + ScoreMetrics.getMin());
+        logger.info("Max score: " + ScoreMetrics.getMax());
+        logger.info("Mean score: " + ScoreMetrics.getMean());
+        logger.info("Percent lower than 0.98: " + ScoreMetrics.getPercentLowerThan(98));
+    }
+
+    public static class ThreadState extends BaseThreadState {
+    }
+}
diff --git a/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/readers/HDF5DatasetReader.java b/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/readers/HDF5DatasetReader.java
new file mode 100644
index 0000000000..c4925d31d2
--- /dev/null
+++ b/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/readers/HDF5DatasetReader.java
@@ -0,0 +1,50 @@
+package com.hazelcast.simulator.tests.vector.readers;
+
+import com.hazelcast.simulator.tests.vector.DatasetReader;
+import io.jhdf.HdfFile;
+import io.jhdf.api.Dataset;
+
+public class HDF5DatasetReader extends DatasetReader {
+
+    private static final int BULK_READER_SIZE = 50_000;
+
+    public HDF5DatasetReader(String url, String directory) {
+        super(url, directory);
+    }
+
+    @Override
+    protected void preprocessDatasetFile() {
+        // nothing to do: the HDF5 file is read in place
+    }
+
+    @Override
+    protected void parseTrainDataset() {
+        trainDataset = getDataset("train");
+        size = trainDataset.length;
+        dimension = trainDataset[0].length;
+    }
+
+    @Override
+    protected void parseTestDataset() {
+        testDataset = getDataset("test");
+    }
+
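+    // Assumption (matches the ann-benchmarks HDF5 layout): the file exposes top-level "train"
+    // and "test" datasets as 2D float arrays where rows are vectors and columns are dimensions.
+    // getData() below pages through the rows BULK_READER_SIZE at a time to bound memory usage.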
+    private float[][] getDataset(String datasetName) {
+        try (HdfFile hdfFile = new HdfFile(downloadedFile.toPath())) {
+            var datasetNode = hdfFile.getChildren().get(datasetName);
+            Dataset dataset = hdfFile.getDatasetByPath(datasetNode.getPath());
+            var dimension = dataset.getDimensions()[1];
+            var size = dataset.getDimensions()[0];
+
+            float[][] array = new float[size][dimension];
+
+            for (int i = 0; i < size; i += BULK_READER_SIZE) {
+                int length = Math.min(BULK_READER_SIZE, size - i);
+                float[][] buffer = (float[][]) dataset.getData(new long[] {i, 0}, new int[]{length, dimension});
+                System.arraycopy(buffer, 0, array, i, buffer.length);
+            }
+            return array;
+        }
+    }
+
+}
diff --git a/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/readers/NpyArchiveDatasetReader.java b/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/readers/NpyArchiveDatasetReader.java
new file mode 100644
index 0000000000..16f7360f1d
--- /dev/null
+++ b/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/readers/NpyArchiveDatasetReader.java
@@ -0,0 +1,86 @@
+package com.hazelcast.simulator.tests.vector.readers;
+
+import com.google.gson.JsonArray;
+import com.google.gson.JsonParser;
+import com.hazelcast.simulator.tests.vector.DatasetReader;
+import org.apache.commons.io.FileUtils;
+import org.jetbrains.bio.npy.NpyArray;
+import org.jetbrains.bio.npy.NpyFile;
+
+import java.io.FileInputStream;
+import java.io.IOException;
+import java.nio.charset.Charset;
+import java.nio.file.Path;
+import java.util.Arrays;
+import java.util.List;
+
+public class NpyArchiveDatasetReader extends DatasetReader {
+
+    private Path trainDatasetFilename;
+    private Path testDatasetFilename;
+
+    private float[] trainDatasetPlain;
+
+    public NpyArchiveDatasetReader(String url, String directory) {
+        super(url, directory);
+    }
+
+    @Override
+    protected void preprocessDatasetFile() {
+        this.trainDatasetFilename = Path.of(workingDirectory.toString(), "vectors.npy");
+        this.testDatasetFilename = Path.of(workingDirectory.toString(), "tests.jsonl");
+
+        if (!trainDatasetFilename.toFile().exists()) {
+            unpack();
+        }
+        logger.info("Unpacking finished.");
+    }
+
+    private void unpack() {
+        try {
+            TarExtractor.extractTarGZ(new FileInputStream(downloadedFile), workingDirectory);
+        } catch (IOException e) {
+            throw new RuntimeException(e);
+        }
+    }
+
+    @Override
+    protected void parseTrainDataset() {
+        NpyArray read = NpyFile.read(trainDatasetFilename, Integer.MAX_VALUE);
+        var shape = read.getShape();
+        size = shape[0];
+        dimension = shape[1];
+        trainDatasetPlain = read.asFloatArray();
+    }
+
+    @Override
+    protected void parseTestDataset() {
+        try {
+            var parser = new JsonParser();
+            List<String> queryList = FileUtils.readLines(testDatasetFilename.toFile(), Charset.defaultCharset());
+            testDataset = new float[queryList.size()][dimension];
+            for (int i = 0; i < queryList.size(); i++) {
+                var jsonArray = parser.parse(queryList.get(i)).getAsJsonObject().getAsJsonArray("query");
+                testDataset[i] = convert(jsonArray);
+            }
+        } catch (IOException e) {
+            throw new RuntimeException(e);
+        }
+    }
+
+    private float[] convert(JsonArray array) {
+        var result = new float[array.size()];
+        for (int i = 0; i < array.size(); i++) {
+            result[i] = array.get(i).getAsFloat();
+        }
+        return result;
+    }
+
+    @Override
+    public float[] getTrainVector(int index) {
+        if (index >= size) {
+            throw new RuntimeException("invalid index");
+        }
+        return Arrays.copyOfRange(trainDatasetPlain, index * dimension, (index + 1) * dimension);
+    }
+}
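For reference, inferred from the parsers above rather than from any dataset documentation: a `.tgz` dataset is expected to unpack into a `vectors.npy` file holding the 2D float train matrix, plus a `tests.jsonl` file in which every line is a JSON object whose `query` array is one test vector (other fields, if present, are ignored). An illustrative line, with made-up values:

```json
{"query": [0.12, -0.07, 0.33, 0.91]}
```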
diff --git a/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/readers/TarExtractor.java b/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/readers/TarExtractor.java
new file mode 100644
index 0000000000..da016919c7
--- /dev/null
+++ b/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/readers/TarExtractor.java
@@ -0,0 +1,51 @@
+package com.hazelcast.simulator.tests.vector.readers;
+
+import org.apache.commons.compress.archivers.tar.TarArchiveEntry;
+import org.apache.commons.compress.archivers.tar.TarArchiveInputStream;
+import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream;
+
+import java.io.BufferedOutputStream;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.nio.file.Path;
+
+public class TarExtractor {
+
+    private static final int BUFFER_SIZE = 100_000;
+
+    public static void extractTarGZ(InputStream in, Path directory) throws IOException {
+        GzipCompressorInputStream gzipIn = new GzipCompressorInputStream(in);
+        try (TarArchiveInputStream tarIn = new TarArchiveInputStream(gzipIn)) {
+            TarArchiveEntry entry;
+
+            while ((entry = tarIn.getNextEntry()) != null) {
+                // If the entry is a directory, create it inside the target directory
+                // (not the current working directory).
+                if (entry.isDirectory()) {
+                    File f = new File(directory.toFile(), entry.getName());
+                    boolean created = f.mkdirs();
+                    if (!created) {
+                        System.out.printf("Unable to create directory '%s', during extraction of archive contents.\n",
+                                f.getAbsolutePath());
+                    }
+                } else {
+                    int count;
+                    byte[] data = new byte[BUFFER_SIZE];
+                    var output = Path.of(directory.toString(), entry.getName());
+                    FileOutputStream fos = new FileOutputStream(output.toFile(), false);
+                    try (BufferedOutputStream dest = new BufferedOutputStream(fos, BUFFER_SIZE)) {
+                        while ((count = tarIn.read(data, 0, BUFFER_SIZE)) != -1) {
+                            dest.write(data, 0, count);
+                        }
+                    }
+                }
+            }
+
+            System.out.println("Untar completed successfully!");
+        }
+    }
+}
+
+
+
diff --git a/java/drivers/driver-hazelcast4plus/src/test/java/com/hazelcast/simulator/utils/DatasetReaderTest.java b/java/drivers/driver-hazelcast4plus/src/test/java/com/hazelcast/simulator/utils/DatasetReaderTest.java
new file mode 100644
index 0000000000..f4636b7a3b
--- /dev/null
+++ b/java/drivers/driver-hazelcast4plus/src/test/java/com/hazelcast/simulator/utils/DatasetReaderTest.java
@@ -0,0 +1,44 @@
+package com.hazelcast.simulator.utils;
+
+import com.hazelcast.simulator.tests.vector.DatasetReader;
+import org.junit.Test;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+
+public class DatasetReaderTest {
+
+    //String url = "https://storage.googleapis.com/ann-filtered-benchmark/datasets/random_keywords_1m.tgz";
+    //String url = "https://storage.googleapis.com/ann-filtered-benchmark/datasets/hnm.tgz"; // work
+    //String url = "https://storage.googleapis.com/ann-filtered-benchmark/datasets/yandex_t2i_gt_100k.tgz"; // not vectors in archive
+    //String url = "https://storage.googleapis.com/ann-filtered-benchmark/datasets/laion-small-clip.tgz"; // broken?
+
+    String workingDirectory = "/Users/oshultseva/Downloads/dataset_output/";
+
+    @Test
+    public void npyArchive() {
+        String url = "https://storage.googleapis.com/ann-filtered-benchmark/datasets/dbpedia_openai_1M.tgz";
+
+        DatasetReader reader = DatasetReader.create(url, workingDirectory);
+
+        assertEquals(975_000, reader.getSize());
+        assertEquals(1536, reader.getDimension());
+        assertEquals(1536, reader.getTestDatasetDimension());
+
+        assertNotNull(reader.getTrainVector(1234));
+        assertNotNull(reader.getTestDataset());
+    }
+
+    @Test
+    public void hdf5() {
+        String url = "http://ann-benchmarks.com/gist-960-euclidean.hdf5";
+
+        DatasetReader reader = DatasetReader.create(url, workingDirectory);
+
+        assertEquals(1_000_000, reader.getSize());
+        assertEquals(960, reader.getDimension());
+        assertEquals(960, reader.getTestDatasetDimension());
+
+        assertNotNull(reader.getTrainVector(1234));
+        assertNotNull(reader.getTestDataset());
+    }
+}
diff --git a/java/pom.xml b/java/pom.xml
index 3b318bc3cd..8884a8feea 100644
--- a/java/pom.xml
+++ b/java/pom.xml
@@ -73,13 +73,13 @@
 3.1.2 3.3.0 - 3.10 + 3.3.0.603 0.8.10 4.9.10 2.16.1 - 2.16.1 + 3.0.1 1.9.4 3.6.0
diff --git a/vector-search-simulator/README.md b/vector-search-simulator/README.md
new file mode 100644
index 0000000000..171b62e0e5
--- /dev/null
+++ b/vector-search-simulator/README.md
@@ -0,0 +1,3 @@
+# Vector Search Simulator
+
+vector-test and vector-test-2 are two different environments with the same tests.
\ No newline at end of file
diff --git a/vector-search-simulator/vector-test-2/README.md b/vector-search-simulator/vector-test-2/README.md
new file mode 100644
index 0000000000..d7758a3c4f
--- /dev/null
+++ b/vector-search-simulator/vector-test-2/README.md
@@ -0,0 +1,36 @@
+The purpose of this simulator test is to test Hazelcast vector search.
+
+To modify the environment, edit the `inventory_plan.yaml`.
+
+To create the environment:
+```shell
+inventory apply
+```
+
+To get an overview of the available instances:
+```shell
+cat inventory.yaml
+```
+
+To install the simulator and Java on the environment:
+```shell
+inventory install java
+inventory install simulator
+```
+
+If you want to get the best performance for your environment:
+```shell
+inventory tune
+```
+
+Modify the tests by editing the `tests.yaml` file.
+
+To run the tests:
+```shell
+perftest run
+```
+
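A minimal entry for one of the vector tests in `tests.yaml` might look like the sketch below. The values are illustrative only, not a tested configuration (in particular, `workingDirectory` is an assumed mount point that must have enough disk for the downloaded dataset); see `vectors-datasets-tests-v2.yaml` in this directory for the real configuration used by these benchmarks:

```yaml
- name: vector_search
  duration: 30m
  clients: 1
  members: 1
  driver: hazelcast-enterprise5
  version: maven=5.4-SNAPSHOT
  loadgenerator_hosts: loadgenerators
  node_hosts: nodes
  test:
    class: com.hazelcast.simulator.tests.vector.VectorCollectionSearchDatasetTest
    name: vector_search
    threadCount: 1
    datasetUrl: http://ann-benchmarks.com/gist-960-euclidean.hdf5
    workingDirectory: /mnt/nvme1n1
    metric: EUCLIDEAN
    limit: 10
```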
+To destroy the environment:
+```shell
+inventory destroy
+```
\ No newline at end of file
diff --git a/vector-search-simulator/vector-test-2/ansible.cfg b/vector-search-simulator/vector-test-2/ansible.cfg
new file mode 100644
index 0000000000..1d761c5638
--- /dev/null
+++ b/vector-search-simulator/vector-test-2/ansible.cfg
@@ -0,0 +1,10 @@
+[defaults]
+host_key_checking = False
+inventory = inventory.yaml
+
+
+# For nice formatted output
+stdout_callback = yaml
+bin_ansible_callbacks = True
+
+interpreter_python=auto_silent
diff --git a/vector-search-simulator/vector-test-2/aws/auto_mount.py b/vector-search-simulator/vector-test-2/aws/auto_mount.py
new file mode 100755
index 0000000000..db3e7886c2
--- /dev/null
+++ b/vector-search-simulator/vector-test-2/aws/auto_mount.py
@@ -0,0 +1,66 @@
+#!/usr/bin/env python3
+
+import subprocess
+import threading
+
+filesystem = "ext3"
+
+
+def list_unmounted_partitions():
+    cmd = 'lsblk --noheadings --raw -o NAME,MOUNTPOINT'
+    lines = subprocess.check_output(cmd, shell=True, text=True).strip().splitlines()
+
+    devices = []
+    partitions = []
+
+    for line in lines:
+        record = line.split()
+        name = record[0]
+
+        if not name.startswith("nvme"):
+            continue
+
+        if "p" in name:
+            partitions.append(name)
+        elif len(record) == 1:
+            devices.append(name)
+
+    # iterate over a copy: removing items from the list being iterated skips elements
+    for dev in devices.copy():
+        for partition in partitions:
+            if partition.startswith(dev):
+                devices.remove(dev)
+                break
+
+    return devices
+
+
+def format_and_mount(dev):
+    cmd = f'sudo mkfs.{filesystem} /dev/{dev}'
+    print(cmd)
+    subprocess.run(cmd, shell=True, text=True, check=True)
+
+    cmd = f'sudo mkdir -p /mnt/{dev}'
+    print(cmd)
+    subprocess.run(cmd, shell=True, text=True, check=True)
+
+    cmd = f'sudo mount -t {filesystem} /dev/{dev} /mnt/{dev}'
+    print(cmd)
+    subprocess.run(cmd, shell=True, text=True, check=True)
+
+    cmd = f'sudo chown ubuntu /mnt/{dev}'
+    print(cmd)
+    subprocess.run(cmd, shell=True, text=True, check=True)
+
+
+unmounted = list_unmounted_partitions()
+
+print(unmounted)
+
+jobs = []
+for dev in unmounted:
+    # pass the callable and its argument; calling format_and_mount(dev) here would
+    # run it synchronously and hand Thread its None return value as the target
+    job = threading.Thread(target=format_and_mount, args=(dev,))
+    jobs.append(job)
+    job.start()
+
+for job in jobs:
+    job.join()
diff --git a/vector-search-simulator/vector-test-2/aws/main.tf b/vector-search-simulator/vector-test-2/aws/main.tf
new file mode 100644
index 0000000000..bbc6077b04
--- /dev/null
+++ b/vector-search-simulator/vector-test-2/aws/main.tf
@@ -0,0 +1,378 @@
+
+locals {
+  settings = yamldecode(file("../inventory_plan.yaml"))
+  private_key = file("../${local.settings.keypair.private_key}")
+  public_key = file("../${local.settings.keypair.public_key}")
+}
+
+provider "aws" {
+  profile = "oshultseva"
+  region = local.settings.region
+}
+
+resource "aws_default_vpc" "vpc" {
+  tags = {
+    Name = "Default VPC"
+  }
+}
+
+#resource "aws_vpc" "prod-vpc" {
+#  cidr_block = "10.0.0.0/16"
+#  enable_dns_support = "true" #gives you an internal domain name
+#  enable_dns_hostnames = "true" #gives you an internal host name
+#  enable_classiclink = "false"
+#  instance_tenancy = "default"
+#
+#  tags = {
+#    Name = "prod-vpc"
+#  }
+#}
+
+#resource "aws_internet_gateway" "my_vpc_igw" {
+#  vpc_id = local.settings.vpc_id
+#
+#  tags = {
+#    Name = "My VPC - Internet Gateway"
+#  }
+#}
+
+resource "aws_key_pair" "keypair" {
+  key_name = "simulator-keypair-${local.settings.basename}"
+  public_key = local.public_key
+}
+
+resource "aws_subnet" "subnet" {
+  vpc_id = local.settings.vpc_id
+  cidr_block = local.settings.cidr_block
+  availability_zone = local.settings.availability_zone
+  map_public_ip_on_launch = true
+  tags = {
+    Name =
"Simulator Public Subnet ${local.settings.basename}" + } +} + +resource "aws_route_table" "route_table" { + vpc_id = local.settings.vpc_id + route { + cidr_block = "0.0.0.0/0" + gateway_id = local.settings.internet_gateway_id + } + + tags = { + Name = "Simulator Public Subnet Route Table ${local.settings.basename}" + } +} + +resource "aws_route_table_association" "route_table_association" { + subnet_id = aws_subnet.subnet.id + route_table_id = aws_route_table.route_table.id +} + +# ========== nodes ========================== + +# Currently there is a single placement group defined for all nodes/load generators. +# If you want to use placement_group, uncomment the commented out 'placementgroup' +# configuration in the nodes and loadgenerators sections. +resource "aws_placement_group" "cluster_placement_group" { + name = "simulator-placement-group-${local.settings.basename}" + strategy = "cluster" +} + +resource "aws_security_group" "node-sg" { + name = "simulator-security-group-node-${local.settings.basename}" + description = "Security group for the node" + vpc_id = local.settings.vpc_id + + tags = { + Name = "Simulator Node Security Group ${local.settings.basename}", + Owner = local.settings.owner + } + + ingress { + description = "SSH" + from_port = 22 + to_port = 22 + protocol = "tcp" + cidr_blocks = ["0.0.0.0/0"] + } + + + ingress { + description = "iperf3_udp" + from_port = 3000 + to_port = 3000 + protocol = "udp" + cidr_blocks = ["0.0.0.0/0"] + } + + ingress { + description = "iperf3_tcp" + from_port = 3000 + to_port = 3000 + protocol = "tcp" + cidr_blocks = ["0.0.0.0/0"] + } + + ingress { + description = "Hazelcast" + from_port = 5701 + to_port = 5801 + protocol = "tcp" + cidr_blocks = ["0.0.0.0/0"] + } + + ingress { + description = "Simulator" + from_port = 9000 + to_port = 9001 + protocol = "tcp" + cidr_blocks = ["0.0.0.0/0"] + } + + ingress { + description = "Hazelcast-tpc" + from_port = 11000 + to_port = 12000 + protocol = "tcp" + cidr_blocks = ["0.0.0.0/0"] + } + + egress { + from_port = 0 + to_port = 0 + protocol = "-1" + cidr_blocks = ["0.0.0.0/0"] + } +} + +resource "aws_instance" "nodes" { + key_name = aws_key_pair.keypair.key_name + ami = local.settings.nodes.ami + instance_type = local.settings.nodes.instance_type + count = local.settings.nodes.count + availability_zone = local.settings.availability_zone + #placement_group = aws_placement_group.cluster_placement_group.name + vpc_security_group_ids = [ aws_security_group.node-sg.id ] + subnet_id = aws_subnet.subnet.id + tenancy = local.settings.nodes.tenancy + + tags = { + Name = "Simulator Node ${local.settings.basename}" + Owner = local.settings.owner + "passthrough:ansible_ssh_private_key_file" = local.settings.keypair.private_key + "passthrough:ansible_user" = local.settings.nodes.user + } + + connection { + type = "ssh" + user = local.settings.nodes.user + private_key = local.private_key + host = self.public_ip + } + + provisioner "file" { + source = "auto_mount.py" + destination = "/tmp/auto_mount.py" + } + + provisioner "remote-exec" { + inline = [ + "python3 /tmp/auto_mount.py" + ] + } +} + +output "nodes" { + value = [aws_instance.nodes.*] +} + +# ========== load generators ========================== + +resource "aws_security_group" "loadgenerator-sg" { + name = "simulator-security-group-loadgenerator-${local.settings.basename}" + description = "Security group for the loadgenerator" + vpc_id = local.settings.vpc_id + + tags = { + Name = "Simulator Load Balancer Security Group ${local.settings.basename}", + Owner = 
local.settings.owner + } + + ingress { + description = "SSH" + from_port = 22 + to_port = 22 + protocol = "tcp" + cidr_blocks = ["0.0.0.0/0"] + } + + + ingress { + description = "iperf3_udp" + from_port = 3000 + to_port = 3000 + protocol = "udp" + cidr_blocks = ["0.0.0.0/0"] + } + + ingress { + description = "iperf3_tcp" + from_port = 3000 + to_port = 3000 + protocol = "tcp" + cidr_blocks = ["0.0.0.0/0"] + } + + ingress { + description = "Hazelcast" + from_port = 5701 + to_port = 5801 + protocol = "tcp" + cidr_blocks = ["0.0.0.0/0"] + } + + ingress { + description = "Simulator" + from_port = 9000 + to_port = 9001 + protocol = "tcp" + cidr_blocks = ["0.0.0.0/0"] + } + + egress { + from_port = 0 + to_port = 0 + protocol = "-1" + cidr_blocks = ["0.0.0.0/0"] + } +} + +resource "aws_instance" "loadgenerators" { + key_name = aws_key_pair.keypair.key_name + ami = local.settings.loadgenerators.ami + instance_type = local.settings.loadgenerators.instance_type + count = local.settings.loadgenerators.count + subnet_id = aws_subnet.subnet.id + availability_zone = local.settings.availability_zone + #placement_group = aws_placement_group.cluster_placement_group.name + vpc_security_group_ids = [ aws_security_group.loadgenerator-sg.id ] + tenancy = local.settings.loadgenerators.tenancy + tags = { + Name = "Simulator Load Generator ${local.settings.basename}" + Owner = local.settings.owner + "passthrough:ansible_ssh_private_key_file" = local.settings.keypair.private_key + "passthrough:ansible_user" = local.settings.loadgenerators.user + } + + connection { + type = "ssh" + user = local.settings.loadgenerators.user + private_key = local.private_key + host = self.public_ip + } + + provisioner "file" { + source = "auto_mount.py" + destination = "/tmp/auto_mount.py" + } + + provisioner "remote-exec" { + inline = [ + "python3 /tmp/auto_mount.py" + ] + } +} + +output "loadgenerators" { + value = [aws_instance.loadgenerators.*] +} + +# ========== management center ========================== + +resource "aws_security_group" "mc-sg" { + name = "simulator-security-group-mc-${local.settings.basename}" + description = "Security group for the Management Center" + vpc_id = local.settings.vpc_id + + tags = { + Name = "Simulator MC Security Group ${local.settings.basename}", + Owner = local.settings.owner + } + + ingress { + description = "SSH" + from_port = 22 + to_port = 22 + protocol = "tcp" + cidr_blocks = ["0.0.0.0/0"] + } + + ingress { + description = "Hazelcast" + from_port = 8080 + to_port = 8080 + protocol = "tcp" + cidr_blocks = ["0.0.0.0/0"] + } + + ingress { + description = "Simulator" + from_port = 8443 + to_port = 8443 + protocol = "tcp" + cidr_blocks = ["0.0.0.0/0"] + } + + egress { + from_port = 0 + to_port = 0 + protocol = "-1" + cidr_blocks = ["0.0.0.0/0"] + } +} + +resource "aws_instance" "mc" { + key_name = aws_key_pair.keypair.key_name + ami = local.settings.mc.ami + instance_type = local.settings.mc.instance_type + count = local.settings.mc.count + subnet_id = aws_subnet.subnet.id + availability_zone = local.settings.availability_zone + vpc_security_group_ids = [ aws_security_group.mc-sg.id ] + + tags = { + Name = "Simulator MC ${local.settings.basename}" + Owner = local.settings.owner + "passthrough:ansible_ssh_private_key_file" = local.settings.keypair.private_key + "passthrough:ansible_user" = local.settings.mc.user + } + + connection { + type = "ssh" + user = local.settings.mc.user + private_key = file("../${local.settings.keypair.private_key}") + host = self.public_ip + } + + provisioner 
"remote-exec" { + inline = [ + "wget -q https://repository.hazelcast.com/download/management-center/hazelcast-management-center-5.0.tar.gz", + "tar -xzvf hazelcast-management-center-5.0.tar.gz", + "while [ ! -f /var/lib/cloud/instance/boot-finished ]; do echo 'Waiting for cloud-init...'; sleep 1; done", + "sudo apt-get -y update", + "sudo apt-get -y install openjdk-11-jdk", + "nohup hazelcast-management-center-5.0/bin/start.sh > mc.out 2>&1 &", + "sleep 2" + ] + } +} + +output "mc" { + value = [aws_instance.mc.*] +} + + + + + + diff --git a/vector-search-simulator/vector-test-2/hazelcast.xml b/vector-search-simulator/vector-test-2/hazelcast.xml new file mode 100644 index 0000000000..0da5668ba5 --- /dev/null +++ b/vector-search-simulator/vector-test-2/hazelcast.xml @@ -0,0 +1,40 @@ + + + + workers + + + + + 5701 + + + + + + + + + + + false + log4j2 + 8 + 16 + + + + + + + + + + + + + + diff --git a/vector-search-simulator/vector-test-2/inventory.yaml b/vector-search-simulator/vector-test-2/inventory.yaml new file mode 100644 index 0000000000..165743a381 --- /dev/null +++ b/vector-search-simulator/vector-test-2/inventory.yaml @@ -0,0 +1,12 @@ +loadgenerators: + hosts: + 3.124.1.127: + ansible_ssh_private_key_file: key + ansible_user: ubuntu + private_ip: 10.0.195.59 +nodes: + hosts: + 18.199.96.216: + ansible_ssh_private_key_file: key + ansible_user: ubuntu + private_ip: 10.0.195.213 diff --git a/vector-search-simulator/vector-test-2/inventory_plan.yaml b/vector-search-simulator/vector-test-2/inventory_plan.yaml new file mode 100644 index 0000000000..b19b789da9 --- /dev/null +++ b/vector-search-simulator/vector-test-2/inventory_plan.yaml @@ -0,0 +1,53 @@ +provisioner: terraform +terraform_plan: aws +# Used for naming resources; give it some unique name specific to a set of benchmarks +basename: oshultseva-lacow +# Enter something here that identifies you. +owner: oshultseva +placement_group_name: None +region: eu-central-1 +availability_zone: eu-central-1a +vpc_id: vpc-002b5a4e5f8b8ece2 +internet_gateway_id: igw-02b8fe3ab75871205 +# Change the '20' to a different octet to prevent running into conflicts. 
+cidr_block: 10.0.195.0/24 + +keypair: + public_key: key.pub + private_key: key + +nodes: + count: 1 + instance_type: i3en.2xlarge + # default AWS AMI + # ami: ami-05cafdf7c9f772ad2 + # user: ec2-user + # ubuntu + ami: ami-04e601abe3e1a910f + user: ubuntu + placement_group: None + tenancy: null + +loadgenerators: + count: 1 + instance_type: i3en.2xlarge + # default AWS AMI + # ami: ami-05cafdf7c9f772ad2 + # user: ec2-user + # ubuntu + ami: ami-04e601abe3e1a910f + user: ubuntu + placement_group: None + tenancy: null + +mc: + instance_type: c5.4xlarge + count: 0 + # default AWS AMI + # ami: ami-05cafdf7c9f772ad2 + # user: ec2-user + # ubuntu + ami: ami-04e601abe3e1a910f + user: ubuntu + placement_group: None + tenancy: null diff --git a/vector-search-simulator/vector-test-2/setup b/vector-search-simulator/vector-test-2/setup new file mode 100755 index 0000000000..39245047da --- /dev/null +++ b/vector-search-simulator/vector-test-2/setup @@ -0,0 +1,14 @@ +#!/bin/bash + +# This script will setup the environment + +set -e + +inventory apply + +echo "Waiting for instances to start" +sleep 60 + +inventory install java --url https://download.java.net/java/GA/jdk21/fd2272bbf8e04c3dbaee13770090416c/35/GPL/openjdk-21_linux-x64_bin.tar.gz + +inventory install simulator \ No newline at end of file diff --git a/vector-search-simulator/vector-test-2/teardown b/vector-search-simulator/vector-test-2/teardown new file mode 100755 index 0000000000..21db8c6cc3 --- /dev/null +++ b/vector-search-simulator/vector-test-2/teardown @@ -0,0 +1,7 @@ +#!/bin/bash + +# This script will teardown the environment + +set -e + +inventory destroy \ No newline at end of file diff --git a/vector-search-simulator/vector-test-2/tests.yaml b/vector-search-simulator/vector-test-2/tests.yaml new file mode 100644 index 0000000000..dec3eab69d --- /dev/null +++ b/vector-search-simulator/vector-test-2/tests.yaml @@ -0,0 +1,43 @@ +- name: map_tiered + duration: 600s + repetitions: 1 + clients: 2 + members: 1 + driver: hazelcast-enterprise5 + version: maven=5.4-SNAPSHOT + client_args: > + -Xms3g + -Xmx3g + member_args: > + -Xms3g + -Xmx3g + -Dhazelcast.diagnostics.enabled=true + -Dhazelcast.diagnostics.metric.level=info + -Dhazelcast.diagnostics.invocation.sample.period.seconds=30 + -Dhazelcast.diagnostics.pending.invocations.period.seconds=30 + -Dhazelcast.diagnostics.slowoperations.period.seconds=30 + -Dhazelcast.diagnostics.storeLatency.period.seconds=60 + --add-modules java.se + --add-exports java.base/jdk.internal.ref=ALL-UNNAMED + --add-opens java.base/java.lang=ALL-UNNAMED + --add-opens java.base/sun.nio.ch=ALL-UNNAMED + --add-opens java.management/sun.management=ALL-UNNAMED + --add-opens jdk.management/com.sun.management.internal=ALL-UNNAMED + loadgenerator_hosts: loadgenerators + node_hosts: nodes + verify_enabled: False + performance_monitor_interval_seconds: 1 + warmup_seconds: 0 + cooldown_seconds: 0 + license_key: + test: + class: com.hazelcast.simulator.hz.map.LongStringMapTest + name: map_tiered + threadCount: 40 + getProb: 0 + putProb: 0 + setProb: 1 + keyDomain: 100_000_000 + minValueLength: 1_000 + maxValueLength: 1_000 + fillOnPrepare: False diff --git a/vector-search-simulator/vector-test-2/vectors-datasets-tests-v2.yaml b/vector-search-simulator/vector-test-2/vectors-datasets-tests-v2.yaml new file mode 100644 index 0000000000..9b86c72261 --- /dev/null +++ b/vector-search-simulator/vector-test-2/vectors-datasets-tests-v2.yaml @@ -0,0 +1,52 @@ +- name: read_only + duration: 30m + repetitions: 1 + clients: 1 + members: 
1
+  driver: hazelcast-enterprise5
+  version: maven=5.5.0-SNAPSHOT
+  client_args: >
+    -Xms30g
+    -Xmx30g
+    --add-modules java.se
+    --add-exports java.base/jdk.internal.ref=ALL-UNNAMED
+    --add-opens java.base/java.lang=ALL-UNNAMED
+    --add-opens java.base/sun.nio.ch=ALL-UNNAMED
+    --add-opens java.management/sun.management=ALL-UNNAMED
+    --add-opens jdk.management/com.sun.management.internal=ALL-UNNAMED
+  member_args: >
+    -Xms30g
+    -Xmx30g
+    --add-modules java.se
+    --add-exports java.base/jdk.internal.ref=ALL-UNNAMED
+    --add-opens java.base/java.lang=ALL-UNNAMED
+    --add-opens java.base/sun.nio.ch=ALL-UNNAMED
+    --add-opens java.management/sun.management=ALL-UNNAMED
+    --add-opens jdk.management/com.sun.management.internal=ALL-UNNAMED
+  loadgenerator_hosts: loadgenerators
+  node_hosts: nodes
+  verify_enabled: False
+  performance_monitor_interval_seconds: 1
+  warmup_seconds: 0
+  cooldown_seconds: 0
+  license_key: LICENSE
+  test:
+    - class: com.hazelcast.simulator.tests.vector.VectorCollectionSearchDatasetTest
+      name: vector
+      threadCount: 1
+#      ratePerSecond: 5_000
+#      interval: 100us
+      logRateMs: 600_000
+      searchProb: 1
+#      numberOfIterations: 1_000_000
+      # generated data parameters
+      datasetUrl: https://storage.googleapis.com/ann-filtered-benchmark/datasets/dbpedia_openai_1M.tgz
+      workingDirectory: /mnt/nvme1n1/workingDirectory/
+      # loadFirst: 32_123
+      # index params
+      metric: COSINE
+      maxDegree: 64
+      efConstruction: 512
+      # search param
+      limit: 1
+
diff --git a/vector-search-simulator/vector-test/README.md b/vector-search-simulator/vector-test/README.md
new file mode 100644
index 0000000000..d7758a3c4f
--- /dev/null
+++ b/vector-search-simulator/vector-test/README.md
@@ -0,0 +1,36 @@
+The purpose of this simulator test is to benchmark Hazelcast vector search.
+
+To modify the environment, edit the `inventory_plan.yaml`.
+
+To create the environment:
+```shell
+inventory apply
+```
+
+To get an overview of the available instances:
+```shell
+cat inventory.yaml
+```
+
+Install Java and the simulator on the environment:
+```shell
+inventory install java
+inventory install simulator
+```
+
+If you want to get the best performance out of your environment:
+```shell
+inventory tune
+```
+
+Modify the tests by editing the test definition file (e.g. `vectors-datasets-tests.yaml`).
+
+To run the tests:
+```shell
+perftest run
+```
+
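+Later revisions of `VectorCollectionSearchDatasetTest` (see PATCH 2/8 below)
+write a per-query report to `precision.out` in the worker's working directory
+on the load generator. A minimal sketch for summarizing it, assuming the CSV
+header the test prints (found-vector score in column 5):
+```shell
+awk -F', ' 'NR > 1 { sum += $5; n++ } END { if (n) print "mean score:", sum / n }' precision.out
+```
+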
+To destroy the environment:
+```shell
+inventory destroy
+```
\ No newline at end of file
diff --git a/vector-search-simulator/vector-test/aws/auto_mount.py b/vector-search-simulator/vector-test/aws/auto_mount.py
new file mode 100755
index 0000000000..db3e7886c2
--- /dev/null
+++ b/vector-search-simulator/vector-test/aws/auto_mount.py
@@ -0,0 +1,66 @@
+#!/usr/bin/env python3
+
+import subprocess
+import threading
+
+filesystem = "ext3"
+
+
+def list_unmounted_partitions():
+    cmd = 'lsblk --noheadings --raw -o NAME,MOUNTPOINT'
+    lines = subprocess.check_output(cmd, shell=True, text=True).strip().splitlines()
+
+    devices = []
+    partitions = []
+
+    for line in lines:
+        record = line.split()
+        name = record[0]
+
+        # Only consider NVMe instance storage.
+        if not name.startswith("nvme"):
+            continue
+
+        if "p" in name:
+            partitions.append(name)
+        elif len(record) == 1:
+            devices.append(name)
+
+    # Drop devices that already have partitions; iterate over a copy so that
+    # removing elements does not skip entries.
+    for dev in list(devices):
+        for partition in partitions:
+            if partition.startswith(dev):
+                devices.remove(dev)
+                break
+
+    return devices
+
+
+def format_and_mount(dev):
+    cmd = f'sudo mkfs.{filesystem} /dev/{dev}'
+    print(cmd)
+    subprocess.run(cmd, shell=True, text=True, check=True)
+
+    cmd = f'sudo mkdir -p /mnt/{dev}'
+    print(cmd)
+    subprocess.run(cmd, shell=True, text=True, check=True)
+
+    cmd = f'sudo mount -t {filesystem} /dev/{dev} /mnt/{dev}'
+    print(cmd)
+    subprocess.run(cmd, shell=True, text=True, check=True)
+
+    cmd = f'sudo chown ubuntu /mnt/{dev}'
+    print(cmd)
+    subprocess.run(cmd, shell=True, text=True, check=True)
+
+
+unmounted = list_unmounted_partitions()
+
+print(unmounted)
+
+jobs = []
+for dev in unmounted:
+    # Pass the callable and its argument separately; calling
+    # format_and_mount(dev) inline would run it synchronously and hand
+    # Thread a None target.
+    job = threading.Thread(target=format_and_mount, args=(dev,))
+    jobs.append(job)
+    job.start()
+
+for job in jobs:
+    job.join()
diff --git a/vector-search-simulator/vector-test/aws/main.tf b/vector-search-simulator/vector-test/aws/main.tf
new file mode 100644
index 0000000000..1a01ca5910
--- /dev/null
+++ b/vector-search-simulator/vector-test/aws/main.tf
@@ -0,0 +1,378 @@
+
+locals {
+  settings    = yamldecode(file("../inventory_plan.yaml"))
+  private_key = file("../${local.settings.keypair.private_key}")
+  public_key  = file("../${local.settings.keypair.public_key}")
+}
+
+provider "aws" {
+  profile = "oshultseva"
+  region  = local.settings.region
+}
+
+resource "aws_default_vpc" "vpc" {
+  tags = {
+    Name = "Default VPC"
+  }
+}
+
+#resource "aws_vpc" "prod-vpc" {
+#  cidr_block = "10.0.0.0/16"
+#  enable_dns_support = "true" #gives you an internal domain name
+#  enable_dns_hostnames = "true" #gives you an internal host name
+#  enable_classiclink = "false"
+#  instance_tenancy = "default"
+#
+#  tags = {
+#    Name = "prod-vpc"
+#  }
+#}
+
+#resource "aws_internet_gateway" "my_vpc_igw" {
+#  vpc_id = local.settings.vpc_id
+#
+#  tags = {
+#    Name = "My VPC - Internet Gateway"
+#  }
+#}
+
+resource "aws_key_pair" "keypair" {
+  key_name   = "simulator-keypair-${local.settings.basename}"
+  public_key = local.public_key
+}
+
+resource "aws_subnet" "subnet" {
+  vpc_id                  = local.settings.vpc_id
+  cidr_block              = local.settings.cidr_block
+  availability_zone       = local.settings.availability_zone
+  map_public_ip_on_launch = true
+  tags = {
+    Name = "Simulator Public Subnet ${local.settings.basename}"
+  }
+}
+
+resource "aws_route_table" "route_table" {
+  vpc_id = local.settings.vpc_id
+  route {
+    cidr_block = "0.0.0.0/0"
+    gateway_id = local.settings.internet_gateway_id
+  }
+
+  tags = {
+    Name = "Simulator Public Subnet Route Table ${local.settings.basename}"
+  }
+}
+
+resource "aws_route_table_association" "route_table_association" {
+  subnet_id      = aws_subnet.subnet.id
+  route_table_id =
aws_route_table.route_table.id +} + +# ========== nodes ========================== + +# Currently there is a single placement group defined for all nodes/load generators. +# If you want to use placement_group, uncomment the commented out 'placementgroup' +# configuration in the nodes and loadgenerators sections. +resource "aws_placement_group" "cluster_placement_group" { + name = "simulator-placement-group-${local.settings.basename}" + strategy = "cluster" +} + +resource "aws_security_group" "node-sg" { + name = "simulator-security-group-node-${local.settings.basename}" + description = "Security group for the node" + vpc_id = local.settings.vpc_id + + tags = { + Name = "Simulator Node Security Group ${local.settings.basename}", + Owner = local.settings.owner + } + + ingress { + description = "SSH" + from_port = 22 + to_port = 22 + protocol = "tcp" + cidr_blocks = ["0.0.0.0/0"] + } + + + ingress { + description = "iperf3_udp" + from_port = 3000 + to_port = 3000 + protocol = "udp" + cidr_blocks = ["0.0.0.0/0"] + } + + ingress { + description = "iperf3_tcp" + from_port = 3000 + to_port = 3000 + protocol = "tcp" + cidr_blocks = ["0.0.0.0/0"] + } + + ingress { + description = "Hazelcast" + from_port = 5701 + to_port = 5801 + protocol = "tcp" + cidr_blocks = ["0.0.0.0/0"] + } + + ingress { + description = "Simulator" + from_port = 9000 + to_port = 9001 + protocol = "tcp" + cidr_blocks = ["0.0.0.0/0"] + } + + ingress { + description = "Hazelcast-tpc" + from_port = 11000 + to_port = 12000 + protocol = "tcp" + cidr_blocks = ["0.0.0.0/0"] + } + + egress { + from_port = 0 + to_port = 0 + protocol = "-1" + cidr_blocks = ["0.0.0.0/0"] + } +} + +resource "aws_instance" "nodes" { + key_name = aws_key_pair.keypair.key_name + ami = local.settings.nodes.ami + instance_type = local.settings.nodes.instance_type + count = local.settings.nodes.count + availability_zone = local.settings.availability_zone + #placement_group = aws_placement_group.cluster_placement_group.name + vpc_security_group_ids = [ aws_security_group.node-sg.id ] + subnet_id = aws_subnet.subnet.id + tenancy = local.settings.nodes.tenancy + + tags = { + Name = "Simulator Node ${local.settings.basename}" + Owner = local.settings.owner + "passthrough:ansible_ssh_private_key_file" = local.settings.keypair.private_key + "passthrough:ansible_user" = local.settings.nodes.user + } + + connection { + type = "ssh" + user = local.settings.nodes.user + private_key = local.private_key + host = self.public_ip + } + + provisioner "file" { + source = "auto_mount.py" + destination = "/tmp/auto_mount.py" + } + + provisioner "remote-exec" { + inline = [ + "python3 /tmp/auto_mount.py" + ] + } +} + +output "nodes" { + value = [aws_instance.nodes.*] +} + +# ========== load generators ========================== + +resource "aws_security_group" "loadgenerator-sg" { + name = "simulator-security-group-loadgenerator-${local.settings.basename}" + description = "Security group for the loadgenerator" + vpc_id = local.settings.vpc_id + + tags = { + Name = "Simulator Load Balancer Security Group ${local.settings.basename}", + Owner = local.settings.owner + } + + ingress { + description = "SSH" + from_port = 22 + to_port = 22 + protocol = "tcp" + cidr_blocks = ["0.0.0.0/0"] + } + + + ingress { + description = "iperf3_udp" + from_port = 3000 + to_port = 3000 + protocol = "udp" + cidr_blocks = ["0.0.0.0/0"] + } + + ingress { + description = "iperf3_tcp" + from_port = 3000 + to_port = 3000 + protocol = "tcp" + cidr_blocks = ["0.0.0.0/0"] + } + + ingress { + description = 
"Hazelcast" + from_port = 5701 + to_port = 5801 + protocol = "tcp" + cidr_blocks = ["0.0.0.0/0"] + } + + ingress { + description = "Simulator" + from_port = 9000 + to_port = 9001 + protocol = "tcp" + cidr_blocks = ["0.0.0.0/0"] + } + + egress { + from_port = 0 + to_port = 0 + protocol = "-1" + cidr_blocks = ["0.0.0.0/0"] + } +} + +resource "aws_instance" "loadgenerators" { + key_name = aws_key_pair.keypair.key_name + ami = local.settings.loadgenerators.ami + instance_type = local.settings.loadgenerators.instance_type + count = local.settings.loadgenerators.count + subnet_id = aws_subnet.subnet.id + availability_zone = local.settings.availability_zone + #placement_group = aws_placement_group.cluster_placement_group.name + vpc_security_group_ids = [ aws_security_group.loadgenerator-sg.id ] + tenancy = local.settings.loadgenerators.tenancy + tags = { + Name = "Simulator Load Generator ${local.settings.basename}" + Owner = local.settings.owner + "passthrough:ansible_ssh_private_key_file" = local.settings.keypair.private_key + "passthrough:ansible_user" = local.settings.loadgenerators.user + } + + connection { + type = "ssh" + user = local.settings.loadgenerators.user + private_key = local.private_key + host = self.public_ip + } + + provisioner "file" { + source = "auto_mount.py" + destination = "/tmp/auto_mount.py" + } + + provisioner "remote-exec" { + inline = [ + "python3 /tmp/auto_mount.py" + ] + } +} + +output "loadgenerators" { + value = [aws_instance.loadgenerators.*] +} + +# ========== management center ========================== + +resource "aws_security_group" "mc-sg" { + name = "simulator-security-group-mc-${local.settings.basename}" + description = "Security group for the Management Center" + vpc_id = local.settings.vpc_id + + tags = { + Name = "Simulator MC Security Group ${local.settings.basename}", + Owner = local.settings.owner + } + + ingress { + description = "SSH" + from_port = 22 + to_port = 22 + protocol = "tcp" + cidr_blocks = ["0.0.0.0/0"] + } + + ingress { + description = "Hazelcast" + from_port = 8080 + to_port = 8080 + protocol = "tcp" + cidr_blocks = ["0.0.0.0/0"] + } + + ingress { + description = "Simulator" + from_port = 8443 + to_port = 8443 + protocol = "tcp" + cidr_blocks = ["0.0.0.0/0"] + } + + egress { + from_port = 0 + to_port = 0 + protocol = "-1" + cidr_blocks = ["0.0.0.0/0"] + } +} + +resource "aws_instance" "mc" { + key_name = aws_key_pair.keypair.key_name + ami = local.settings.mc.ami + instance_type = local.settings.mc.instance_type + count = local.settings.mc.count + subnet_id = aws_subnet.subnet.id + availability_zone = local.settings.availability_zone + vpc_security_group_ids = [ aws_security_group.mc-sg.id ] + + tags = { + Name = "Simulator MC ${local.settings.basename}" + Owner = local.settings.owner + "passthrough:ansible_ssh_private_key_file" = local.settings.keypair.private_key + "passthrough:ansible_user" = local.settings.mc.user + } + + connection { + type = "ssh" + user = local.settings.mc.user + private_key = file("../${local.settings.keypair.private_key}") + host = self.public_ip + } + + provisioner "remote-exec" { + inline = [ + "wget -q https://repository.hazelcast.com/download/management-center/hazelcast-management-center-5.0.tar.gz", + "tar -xzvf hazelcast-management-center-5.0.tar.gz", + "while [ ! 
-f /var/lib/cloud/instance/boot-finished ]; do echo 'Waiting for cloud-init...'; sleep 1; done", + "sudo apt-get -y update", + "sudo apt-get -y install openjdk-11-jdk", + "nohup hazelcast-management-center-5.0/bin/start.sh > mc.out 2>&1 &", + "sleep 2" + ] + } +} + +output "mc" { + value = [aws_instance.mc.*] +} + + + + + + diff --git a/vector-search-simulator/vector-test/hazelcast.xml b/vector-search-simulator/vector-test/hazelcast.xml new file mode 100644 index 0000000000..a47d3273e2 --- /dev/null +++ b/vector-search-simulator/vector-test/hazelcast.xml @@ -0,0 +1,40 @@ + + + + workers + + + + + 5701 + + + + + + + + + + + false + log4j2 + 16 + + + + + + + + + + + + + + + diff --git a/vector-search-simulator/vector-test/inventory.yaml b/vector-search-simulator/vector-test/inventory.yaml new file mode 100644 index 0000000000..bc54e4615d --- /dev/null +++ b/vector-search-simulator/vector-test/inventory.yaml @@ -0,0 +1,12 @@ +loadgenerators: + hosts: + 3.68.98.252: + ansible_ssh_private_key_file: key + ansible_user: ubuntu + private_ip: 10.0.194.113 +nodes: + hosts: + 3.70.95.192: + ansible_ssh_private_key_file: key + ansible_user: ubuntu + private_ip: 10.0.194.248 diff --git a/vector-search-simulator/vector-test/inventory_plan.yaml b/vector-search-simulator/vector-test/inventory_plan.yaml new file mode 100644 index 0000000000..d177969a28 --- /dev/null +++ b/vector-search-simulator/vector-test/inventory_plan.yaml @@ -0,0 +1,53 @@ +provisioner: terraform +terraform_plan: aws +# Used for naming resources; give it some unique name specific to a set of benchmarks +basename: oshultseva-hcbsg +# Enter something here that identifies you. +owner: oshultseva +placement_group_name: None +region: eu-central-1 +availability_zone: eu-central-1a +vpc_id: vpc-002b5a4e5f8b8ece2 +internet_gateway_id: igw-02b8fe3ab75871205 +# Change the '20' to a different octet to prevent running into conflicts. 
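+# (Note: this plan uses octet '194'; the sibling vector-test-2 plan uses '195'.
+# Pick an unused octet if you clone this directory for another benchmark.)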
+cidr_block: 10.0.194.0/24 + +keypair: + public_key: key.pub + private_key: key + +nodes: + count: 1 + instance_type: i3en.2xlarge + # default AWS AMI + # ami: ami-05cafdf7c9f772ad2 + # user: ec2-user + # ubuntu + ami: ami-04e601abe3e1a910f + user: ubuntu + placement_group: None + tenancy: null + +loadgenerators: + count: 1 + instance_type: i3en.2xlarge + # default AWS AMI + # ami: ami-05cafdf7c9f772ad2 + # user: ec2-user + # ubuntu + ami: ami-04e601abe3e1a910f + user: ubuntu + placement_group: None + tenancy: null + +mc: + instance_type: c5.4xlarge + count: 0 + # default AWS AMI + # ami: ami-05cafdf7c9f772ad2 + # user: ec2-user + # ubuntu + ami: ami-04e601abe3e1a910f + user: ubuntu + placement_group: None + tenancy: null diff --git a/vector-search-simulator/vector-test/setup b/vector-search-simulator/vector-test/setup new file mode 100755 index 0000000000..39245047da --- /dev/null +++ b/vector-search-simulator/vector-test/setup @@ -0,0 +1,14 @@ +#!/bin/bash + +# This script will setup the environment + +set -e + +inventory apply + +echo "Waiting for instances to start" +sleep 60 + +inventory install java --url https://download.java.net/java/GA/jdk21/fd2272bbf8e04c3dbaee13770090416c/35/GPL/openjdk-21_linux-x64_bin.tar.gz + +inventory install simulator \ No newline at end of file diff --git a/vector-search-simulator/vector-test/teardown b/vector-search-simulator/vector-test/teardown new file mode 100755 index 0000000000..21db8c6cc3 --- /dev/null +++ b/vector-search-simulator/vector-test/teardown @@ -0,0 +1,7 @@ +#!/bin/bash + +# This script will teardown the environment + +set -e + +inventory destroy \ No newline at end of file diff --git a/vector-search-simulator/vector-test/vectors-datasets-tests-v2.yaml b/vector-search-simulator/vector-test/vectors-datasets-tests-v2.yaml new file mode 100644 index 0000000000..89a62d12f7 --- /dev/null +++ b/vector-search-simulator/vector-test/vectors-datasets-tests-v2.yaml @@ -0,0 +1,51 @@ +- name: read_only + duration: 30m + repetitions: 1 + clients: 1 + members: 1 + driver: hazelcast-enterprise5 + version: maven=5.5.0-SNAPSHOT + client_args: > + -Xms30g + -Xmx30g + --add-modules java.se + --add-exports java.base/jdk.internal.ref=ALL-UNNAMED + --add-opens java.base/java.lang=ALL-UNNAMED + --add-opens java.base/sun.nio.ch=ALL-UNNAMED + --add-opens java.management/sun.management=ALL-UNNAMED + --add-opens jdk.management/com.sun.management.internal=ALL-UNNAMED + member_args: > + -Xms30g + -Xmx30g + --add-modules java.se + --add-exports java.base/jdk.internal.ref=ALL-UNNAMED + --add-opens java.base/java.lang=ALL-UNNAMED + --add-opens java.base/sun.nio.ch=ALL-UNNAMED + --add-opens java.management/sun.management=ALL-UNNAMED + --add-opens jdk.management/com.sun.management.internal=ALL-UNNAMED + loadgenerator_hosts: loadgenerators + node_hosts: nodes + verify_enabled: False + performance_monitor_interval_seconds: 1 + warmup_seconds: 0 + cooldown_seconds: 0 + license_key: LICENSE + test: + - class: com.hazelcast.simulator.tests.vector.VectorCollectionSearchDatasetTest + name: vector + threadCount: 1 + # ratePerSecond: 5_000 + # interval: 100us + logRateMs: 60_000 + searchProb: 1 + # numberOfIterations: 1_000_000 + # data parameters + datasetUrl: http://ann-benchmarks.com/gist-960-euclidean.hdf5 + workingDirectory: /mnt/nvme1n1/workingDirectory/ + # loadFirst: 10_123 + # index params + metric: EUCLIDEAN + maxDegree: 32 + efConstruction: 512 + # search param + limit: 1 \ No newline at end of file diff --git 
a/vector-search-simulator/vector-test/vectors-datasets-tests.yaml b/vector-search-simulator/vector-test/vectors-datasets-tests.yaml new file mode 100644 index 0000000000..d422aa1773 --- /dev/null +++ b/vector-search-simulator/vector-test/vectors-datasets-tests.yaml @@ -0,0 +1,52 @@ +- name: read_only + duration: 30m + repetitions: 1 + clients: 1 + members: 1 + driver: hazelcast-enterprise5 + version: maven=5.5.0-SNAPSHOT + client_args: > + -Xms30g + -Xmx30g + --add-modules java.se + --add-exports java.base/jdk.internal.ref=ALL-UNNAMED + --add-opens java.base/java.lang=ALL-UNNAMED + --add-opens java.base/sun.nio.ch=ALL-UNNAMED + --add-opens java.management/sun.management=ALL-UNNAMED + --add-opens jdk.management/com.sun.management.internal=ALL-UNNAMED + member_args: > + -Xms30g + -Xmx30g + --add-modules java.se + --add-exports java.base/jdk.internal.ref=ALL-UNNAMED + --add-opens java.base/java.lang=ALL-UNNAMED + --add-opens java.base/sun.nio.ch=ALL-UNNAMED + --add-opens java.management/sun.management=ALL-UNNAMED + --add-opens jdk.management/com.sun.management.internal=ALL-UNNAMED + loadgenerator_hosts: loadgenerators + node_hosts: nodes + verify_enabled: False + performance_monitor_interval_seconds: 1 + warmup_seconds: 0 + cooldown_seconds: 0 + license_key: LICENSE + test: + - class: com.hazelcast.simulator.tests.vector.VectorCollectionFromUrlTest + name: vector + threadCount: 1 +# ratePerSecond: 5_000 +# interval: 100us + logRateMs: 60_000 + searchProb: 1 +# numberOfIterations: 1_000_000 + # generated data parameters + datasetUrl: https://storage.googleapis.com/ann-filtered-benchmark/datasets/dbpedia_openai_1M.tgz + workingDirectory: /mnt/nvme1n1/workingDirectory/ + # loadFirst: 30_000 + # index params + metric: COSINE + maxDegree: 32 + efConstruction: 512 + # search param + limit: 1 + From f83f2a7a0d00360436e1cc3314e5ffa9004d930f Mon Sep 17 00:00:00 2001 From: oshultseva Date: Thu, 9 May 2024 12:09:40 +0100 Subject: [PATCH 2/8] vector db test --- .../simulator/tests/vector/DatasetReader.java | 33 ++-- .../VectorCollectionSearchDatasetTest.java | 164 ++++++++++++++---- .../simulator/tests/vector/VectorUtils.java | 25 +++ .../tests/vector/model/TestDataset.java | 56 ++++++ .../vector/readers/HDF5DatasetReader.java | 39 ++++- .../readers/NpyArchiveDatasetReader.java | 29 +++- .../vector/model/TestDatasetDiffblueTest.java | 71 ++++++++ .../simulator/utils/DatasetReaderTest.java | 4 +- 8 files changed, 350 insertions(+), 71 deletions(-) create mode 100644 java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/VectorUtils.java create mode 100644 java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/model/TestDataset.java create mode 100644 java/drivers/driver-hazelcast4plus/src/test/java/com/hazelcast/simulator/tests/vector/model/TestDatasetDiffblueTest.java diff --git a/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/DatasetReader.java b/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/DatasetReader.java index 770aa13ef9..5b37bdfd5b 100644 --- a/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/DatasetReader.java +++ b/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/DatasetReader.java @@ -1,5 +1,6 @@ package com.hazelcast.simulator.tests.vector; +import com.hazelcast.simulator.tests.vector.model.TestDataset; import com.hazelcast.simulator.tests.vector.readers.HDF5DatasetReader; import 
com.hazelcast.simulator.tests.vector.readers.NpyArchiveDatasetReader; import org.apache.commons.io.FileUtils; @@ -32,7 +33,7 @@ public abstract class DatasetReader { protected float[][] trainDataset; - protected float[][] testDataset; + protected TestDataset testDataset; protected int dimension; @@ -46,11 +47,11 @@ public DatasetReader(String url, String directory) { this.workingDirectory = Path.of(directory, FilenameUtils.getBaseName(datasetURL.getFile())); this.downloadedFile = Path.of(workingDirectory.toString(), FilenameUtils.getName(datasetURL.getFile())).toFile(); - logger.info("Start downloading file from " + datasetURL); + logger.info("Start downloading file from {}", datasetURL); if (!downloadedFile.exists()) { download(); } - logger.info("File downloaded to " + downloadedFile + ". Start unpacking..."); + logger.info("File downloaded to {}. Start unpacking...", downloadedFile); preprocessDatasetFile(); parseTrainDataset(); @@ -63,7 +64,6 @@ public DatasetReader(String url, String directory) { protected abstract void preprocessDatasetFile(); protected abstract void parseTrainDataset(); - protected abstract void parseTestDataset(); private void cleanup() { @@ -78,7 +78,7 @@ public float[] getTrainVector(int index) { return trainDataset[index]; } - public float[][] getTestDataset() { + public TestDataset getTestDataset() { return testDataset; } @@ -91,10 +91,7 @@ public int getSize() { } public int getTestDatasetDimension() { - if(testDataset.length == 0) { - return 0; - } - return testDataset[0].length; + return testDataset.getDimension(); } private void download() { @@ -112,21 +109,15 @@ private void download() { } - private static class FileDownloadResponseHandler implements ResponseHandler { - - private final File target; - - public FileDownloadResponseHandler(File target) { - this.target = target; - } + private record FileDownloadResponseHandler(File target) implements ResponseHandler { @Override - public Void handleResponse(HttpResponse response) throws IOException { - InputStream source = response.getEntity().getContent(); - FileUtils.copyInputStreamToFile(source, this.target); - return null; + public Void handleResponse(HttpResponse response) throws IOException { + InputStream source = response.getEntity().getContent(); + FileUtils.copyInputStreamToFile(source, this.target); + return null; + } } - } public static DatasetReader create(String url, String directory) { try { diff --git a/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/VectorCollectionSearchDatasetTest.java b/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/VectorCollectionSearchDatasetTest.java index 8766daaa36..b9865a1fad 100644 --- a/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/VectorCollectionSearchDatasetTest.java +++ b/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/VectorCollectionSearchDatasetTest.java @@ -3,23 +3,34 @@ import com.hazelcast.config.vector.Metric; import com.hazelcast.config.vector.VectorCollectionConfig; import com.hazelcast.config.vector.VectorIndexConfig; +import com.hazelcast.core.Pipelining; import com.hazelcast.simulator.hz.HazelcastTest; import com.hazelcast.simulator.test.BaseThreadState; import com.hazelcast.simulator.test.annotations.AfterRun; import com.hazelcast.simulator.test.annotations.Setup; import com.hazelcast.simulator.test.annotations.TimeStep; +import com.hazelcast.simulator.tests.vector.model.TestDataset; import 
com.hazelcast.vector.SearchOptions; import com.hazelcast.vector.SearchOptionsBuilder; +import com.hazelcast.vector.SearchResults; import com.hazelcast.vector.VectorCollection; import com.hazelcast.vector.VectorDocument; import com.hazelcast.vector.VectorValues; +import java.io.FileWriter; +import java.io.IOException; +import java.io.PrintWriter; +import java.util.ArrayList; import java.util.HashMap; +import java.util.List; import java.util.Map; -import java.util.concurrent.TimeUnit; +import java.util.Queue; +import java.util.concurrent.CompletionStage; +import java.util.concurrent.ConcurrentLinkedQueue; import java.util.concurrent.atomic.AtomicInteger; +import java.util.function.Function; -import static java.lang.String.format; +import static java.util.concurrent.TimeUnit.MILLISECONDS; public class VectorCollectionSearchDatasetTest extends HazelcastTest { @@ -48,12 +59,18 @@ public class VectorCollectionSearchDatasetTest extends HazelcastTest { private static final String collectionName = "performance-collection"; private static final int PUT_BATCH_SIZE = 10_000; + + private static final int MAX_PUT_ALL_IN_FLIGHT = 5; + private VectorCollection collection; private final AtomicInteger searchCounter = new AtomicInteger(0); private final AtomicInteger putCounter = new AtomicInteger(0); - private float[][] testDataset; + + private TestDataset testDataset; + + private final Queue searchResults = new ConcurrentLinkedQueue<>(); @Setup public void setup() { @@ -62,7 +79,7 @@ public void setup() { int dimension = reader.getDimension(); assert dimension == reader.getTestDatasetDimension() : "dataset dimension does not correspond to query vector dimension"; testDataset = reader.getTestDataset(); - numberOfSearchIterations = Math.min(numberOfSearchIterations, testDataset.length); + numberOfSearchIterations = Math.min(numberOfSearchIterations, testDataset.size()); collection = VectorCollection.getCollection( targetInstance, @@ -80,64 +97,139 @@ public void setup() { Map> buffer = new HashMap<>(); int index; + Pipelining pipelining = new Pipelining<>(MAX_PUT_ALL_IN_FLIGHT); logger.info("Start loading data..."); + while ((index = putCounter.getAndIncrement()) < size) { buffer.put(index, VectorDocument.of(index, VectorValues.of(reader.getTrainVector(index)))); if (buffer.size() % PUT_BATCH_SIZE == 0) { - var blockStart = System.currentTimeMillis(); - collection.putAllAsync(buffer).toCompletableFuture().join(); + addToPipelineWithLogging(pipelining, collection.putAllAsync(buffer)); logger.info( - format( - "Uploaded %s vectors from %s. Block size: %s. Delta (m): %s. Total time (m): %s", - index, - size, - buffer.size(), - TimeUnit.MILLISECONDS.toMinutes(System.currentTimeMillis() - blockStart), - TimeUnit.MILLISECONDS.toMinutes(System.currentTimeMillis() - start) - ) + "Uploaded {} vectors from {}. Block size: {}. Total time (min): {}", + index, + size, + buffer.size(), + MILLISECONDS.toMinutes(System.currentTimeMillis() - start) ); - buffer.clear(); + buffer = new HashMap<>(); } } if (!buffer.isEmpty()) { - collection.putAllAsync(buffer).toCompletableFuture().join(); - logger.info(format("Uploaded vectors. Last block size: %s.", buffer.size())); + addToPipelineWithLogging(pipelining, collection.putAllAsync(buffer)); + logger.info("Uploading last vectors block. 
Last block size: {}.", buffer.size());
+            buffer.clear();
         }
 
-        var startCleanup = System.currentTimeMillis();
-        collection.optimizeAsync().toCompletableFuture().join();
-        logger.info("Collection size: " + size);
-        logger.info("Collection dimension: " + reader.getDimension());
-        logger.info("Cleanup time(ms): " + (System.currentTimeMillis() - startCleanup));
-        logger.info("Index build time(m): " + TimeUnit.MILLISECONDS.toMinutes(System.currentTimeMillis() - start));
+        logger.info("Waiting for pipeline results...");
+        var pipelineWaiting = withTimer(() -> {
+            try {
+                pipelining.results();
+            } catch (Exception e) {
+                throw new RuntimeException(e);
+            }
+        });
+        logger.info("Pipeline waiting finished in {} min", MILLISECONDS.toMinutes(pipelineWaiting));
+
+        var cleanupTimer = withTimer(() -> collection.optimizeAsync().toCompletableFuture().join());
+
+        logger.info("Collection size: {}", size);
+        logger.info("Collection dimension: {}", reader.getDimension());
+        logger.info("Cleanup time (min): {}", MILLISECONDS.toMinutes(cleanupTimer));
+        logger.info("Index build time (min): {}", MILLISECONDS.toMinutes(System.currentTimeMillis() - start));
     }
 
-    @TimeStep(prob = 1)
+    @TimeStep()
     public void search(ThreadState state) {
-        var iteration = searchCounter.incrementAndGet();
+        var iteration = searchCounter.getAndIncrement();
         if (iteration >= numberOfSearchIterations) {
             testContext.stop();
             return;
         }
-        var vector = testDataset[iteration];
-        SearchOptions options = new SearchOptionsBuilder().vector(vector).includePayload().limit(limit).build();
+        var vector = testDataset.getSearchVector(iteration);
+        SearchOptions options = new SearchOptionsBuilder().vector(vector).includePayload().includeVectors().limit(limit).build();
         var result = collection.searchAsync(options).toCompletableFuture().join();
-
-        var score = result.results().next().getScore();
-        ScoreMetrics.set((int) (score * 100));
-        logger.info("Found score: " + result.results().next().getScore());
+        searchResults.add(new TestSearchResult(iteration, vector, result));
     }
 
     @AfterRun
     public void afterRun() {
-        logger.info("Number of search iteration: " + searchCounter.get());
-        logger.info("Min score: " + ScoreMetrics.getMin());
-        logger.info("Max score: " + ScoreMetrics.getMax());
-        logger.info("Mean score: " + ScoreMetrics.getMean());
-        logger.info("Percent lower then 0.98: " + ScoreMetrics.getPercentLowerThen(98));
+        searchResults.forEach(testSearchResult -> {
+            int index = testSearchResult.index();
+            List<Integer> ids = new ArrayList<>();
+            VectorUtils.forEach(testSearchResult.results, r -> ids.add((Integer) r.getKey()));
+            ScoreMetrics.set((int) (testDataset.getPrecisionV1(ids, index, limit) * 100));
+        });
+
+        writePureResultsToFile("precision.out");
+        logger.info("Number of search iterations: {}", searchCounter.get());
+        logger.info("Min score: {}", ScoreMetrics.getMin());
+        logger.info("Max score: {}", ScoreMetrics.getMax());
+        logger.info("Mean score: {}", ScoreMetrics.getMean());
+        logger.info("Percent of results lower than 98% precision: {}", ScoreMetrics.getPercentLowerThen(98));
     }
 
     public static class ThreadState extends BaseThreadState {
     }
+
+    public record TestSearchResult(int index, float[] searchVector, SearchResults results) {
+    }
+
+    private void writePureResultsToFile(String fileName) {
+        try {
+            Function<Float, Float> restore = VectorUtils.restoreRealMetric(Metric.valueOf(metric));
+            var fileWriter = new FileWriter(fileName);
+            PrintWriter printWriter = new PrintWriter(fileWriter);
+            printWriter.println("index, searchVector0, foundVector0, foundVectorKey,
foundVectorScore, restoredRealVectorScore"); + searchResults.forEach( + testSearchResult -> VectorUtils.forEach( + testSearchResult.results, + (result) -> printWriter.printf( + "%d, %s, %s, %s, %s, %s\n", + testSearchResult.index, + testSearchResult.searchVector[0], + getFirstCoordinate(result.getVectors()), + result.getKey(), + result.getScore(), + restore.apply(result.getScore()) + ) + ) + ); + printWriter.close(); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + void addToPipelineWithLogging(Pipelining pipelining, CompletionStage asyncInvocation) { + var now = System.currentTimeMillis(); + try { + pipelining.add(asyncInvocation); + } catch (InterruptedException e) { + throw new RuntimeException(e); + } + var msBlocked = System.currentTimeMillis() - now; + // log if we were blocked for more than 30 sec + if (msBlocked > 30_000) { + logger.info( + "Thread was blocked for {} sec due to reaching max pipeline depth", + MILLISECONDS.toSeconds(msBlocked) + ); + } + } + + private long withTimer(Runnable runnable) { + var start = System.currentTimeMillis(); + runnable.run(); + return System.currentTimeMillis() - start; + } + + private float getFirstCoordinate(VectorValues vectorValues) { + var v = (VectorValues.SingleVectorValues) vectorValues; + if(v == null || v.vector().length == 0) { + return 0; + } + return v.vector()[0]; + } + + } diff --git a/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/VectorUtils.java b/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/VectorUtils.java new file mode 100644 index 0000000000..84e42dcee3 --- /dev/null +++ b/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/VectorUtils.java @@ -0,0 +1,25 @@ +package com.hazelcast.simulator.tests.vector; + +import com.hazelcast.vector.SearchResult; +import com.hazelcast.vector.SearchResults; +import com.hazelcast.config.vector.Metric; + +import java.util.function.Consumer; +import java.util.function.Function; + +public class VectorUtils { + + public static void forEach(SearchResults searchResults, Consumer consumer) { + var resultsIterator = searchResults.results(); + while (resultsIterator.hasNext()) { + consumer.accept(resultsIterator.next()); + } + } + + public static Function restoreRealMetric(Metric metric) { + return switch (metric) { + case COSINE -> jMetric -> 2 * jMetric - 1; + default -> jMetric -> -1f; + }; + } +} diff --git a/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/model/TestDataset.java b/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/model/TestDataset.java new file mode 100644 index 0000000000..bcf9066761 --- /dev/null +++ b/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/model/TestDataset.java @@ -0,0 +1,56 @@ +package com.hazelcast.simulator.tests.vector.model; + +import java.util.Arrays; +import java.util.HashSet; +import java.util.List; +import java.util.stream.Collectors; + +public class TestDataset { + + private final float[][] searchVectors; + + private final int[][] closestIds; + + private final float[][] closestScores; + + public TestDataset(float[][] searchVector, int[][] closestIds, float[][] closestScores) { + this.searchVectors = searchVector; + this.closestIds = closestIds; + this.closestScores = closestScores; + } + + public float[] getSearchVector(int index) { + return searchVectors[index]; + } + + public int getDimension() { + 
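+        // All search vectors share the same dimension, so the first row is
+        // representative; an empty dataset reports dimension 0.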
if(searchVectors.length == 0) { + return 0; + } + return searchVectors[0].length; + } + + public int size() { + return searchVectors.length; + } + + public float getPrecisionV1(List actualVectorsIds, int index, int top) { + var actualSet = new HashSet<>(actualVectorsIds); + var expectedSet = Arrays.stream(Arrays.copyOfRange(closestIds[index], 0, top)).boxed().collect(Collectors.toSet()); + actualSet.retainAll(expectedSet); + return ((float) actualSet.size()) / top; + } + + public float getPrecisionV2(float[] actualVectorsScore, int index) { + var expected = Arrays.copyOfRange(closestScores[index], 0, actualVectorsScore.length); + return distance(actualVectorsScore, expected); + } + + private float distance(float[] array1, float[] array2) { + double sum = 0f; + for (int i = 0; i < array1.length; i++) { + sum += Math.pow((array1[i] - array2[i]), 2.0); + } + return (float) Math.sqrt(sum); + } +} diff --git a/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/readers/HDF5DatasetReader.java b/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/readers/HDF5DatasetReader.java index c4925d31d2..d33da74efc 100644 --- a/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/readers/HDF5DatasetReader.java +++ b/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/readers/HDF5DatasetReader.java @@ -1,12 +1,13 @@ package com.hazelcast.simulator.tests.vector.readers; import com.hazelcast.simulator.tests.vector.DatasetReader; +import com.hazelcast.simulator.tests.vector.model.TestDataset; import io.jhdf.HdfFile; import io.jhdf.api.Dataset; public class HDF5DatasetReader extends DatasetReader { - private static int BULK_READER_SIZE = 50_000; + private static final int BULK_READER_SIZE = 50_000; public HDF5DatasetReader(String url, String directory) { super(url, directory); @@ -19,32 +20,54 @@ protected void preprocessDatasetFile() { @Override protected void parseTrainDataset() { - trainDataset = getDataset("train"); + trainDataset = getDatasetAsFloatMatrix("train"); size = trainDataset.length; dimension = trainDataset[0].length; } @Override protected void parseTestDataset() { - testDataset = getDataset("test"); + var searchVectors = getDatasetAsFloatMatrix("test"); + // todo - now scores in reversed order + var ids = getDatasetAsIntMatrix("neighbors"); + var scores = getDatasetAsFloatMatrix("distances"); + testDataset = new TestDataset(searchVectors, ids, scores); } - private float[][] getDataset(String datasetName) { + private float[][] getDatasetAsFloatMatrix(String datasetName) { try (HdfFile hdfFile = new HdfFile(downloadedFile.toPath())) { var datasetNode = hdfFile.getChildren().get(datasetName); Dataset dataset = hdfFile.getDatasetByPath(datasetNode.getPath()); var dimension = dataset.getDimensions()[1]; var size = dataset.getDimensions()[0]; - float[][] array = new float[size][dimension]; + float[][] matrix = new float[size][dimension]; for (int i = 0; i < size; i += BULK_READER_SIZE) { int length = Math.min(BULK_READER_SIZE, size - i); - float[][] buffer = (float[][]) dataset.getData(new long[] {i, 0}, new int[]{length, dimension}); - System.arraycopy(buffer, 0, array, i, buffer.length); + float[][] buffer = (float[][]) dataset.getData(new long[]{i, 0}, new int[]{length, dimension}); + System.arraycopy(buffer, 0, matrix, i, buffer.length); } - return array; + return matrix; } } + // todo - refactor one file once + private int[][] getDatasetAsIntMatrix(String 
datasetName) { + try (HdfFile hdfFile = new HdfFile(downloadedFile.toPath())) { + var datasetNode = hdfFile.getChildren().get(datasetName); + Dataset dataset = hdfFile.getDatasetByPath(datasetNode.getPath()); + var dimension = dataset.getDimensions()[1]; + var size = dataset.getDimensions()[0]; + + int[][] matrix = new int[size][dimension]; + + for (int i = 0; i < size; i += BULK_READER_SIZE) { + int length = Math.min(BULK_READER_SIZE, size - i); + int[][] buffer = (int[][]) dataset.getData(new long[]{i, 0}, new int[]{length, dimension}); + System.arraycopy(buffer, 0, matrix, i, buffer.length); + } + return matrix; + } + } } diff --git a/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/readers/NpyArchiveDatasetReader.java b/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/readers/NpyArchiveDatasetReader.java index 16f7360f1d..b0a3f19819 100644 --- a/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/readers/NpyArchiveDatasetReader.java +++ b/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/readers/NpyArchiveDatasetReader.java @@ -3,6 +3,7 @@ import com.google.gson.JsonArray; import com.google.gson.JsonParser; import com.hazelcast.simulator.tests.vector.DatasetReader; +import com.hazelcast.simulator.tests.vector.model.TestDataset; import org.apache.commons.io.FileUtils; import org.jetbrains.bio.npy.NpyArray; import org.jetbrains.bio.npy.NpyFile; @@ -58,17 +59,29 @@ protected void parseTestDataset() { try { var parser = new JsonParser(); List queryList = FileUtils.readLines(testDatesetFilename.toFile(), Charset.defaultCharset()); - testDataset = new float[queryList.size()][dimension]; + int size = queryList.size(); + var searchVectors = new float[size][dimension]; + var searchClosestIds = new int[size][]; + var searchClosestScore = new float[size][]; for (int i = 0; i < queryList.size(); i++) { - var jsonArray = parser.parse(queryList.get(i)).getAsJsonObject().getAsJsonArray("query"); - testDataset[i] = convert(jsonArray); + var queryObject = parser.parse(queryList.get(i)).getAsJsonObject(); + var jsonArray = queryObject.getAsJsonArray("query"); + var ids = queryObject.getAsJsonArray("closest_ids"); + var scores = queryObject.getAsJsonArray("closest_scores"); + searchVectors[i] = convertToFloatArray(jsonArray); + searchClosestIds[i] = convertToIntArray(ids); + searchClosestScore[i] = convertToFloatArray(scores); + } + testDataset = new TestDataset(searchVectors, searchClosestIds, searchClosestScore); } catch (IOException e) { throw new RuntimeException(e); } } - private float[] convert(JsonArray array) { + + + private float[] convertToFloatArray(JsonArray array) { var result = new float[array.size()]; for (int i = 0; i < array.size(); i++) { result[i] = array.get(i).getAsFloat(); @@ -76,6 +89,14 @@ private float[] convert(JsonArray array) { return result; } + private int[] convertToIntArray(JsonArray array) { + var result = new int[array.size()]; + for (int i = 0; i < array.size(); i++) { + result[i] = array.get(i).getAsInt(); + } + return result; + } + @Override public float[] getTrainVector(int index) { if (index >= size) { diff --git a/java/drivers/driver-hazelcast4plus/src/test/java/com/hazelcast/simulator/tests/vector/model/TestDatasetDiffblueTest.java b/java/drivers/driver-hazelcast4plus/src/test/java/com/hazelcast/simulator/tests/vector/model/TestDatasetDiffblueTest.java new file mode 100644 index 0000000000..ec92638fae --- /dev/null +++ 
b/java/drivers/driver-hazelcast4plus/src/test/java/com/hazelcast/simulator/tests/vector/model/TestDatasetDiffblueTest.java
@@ -0,0 +1,71 @@
+package com.hazelcast.simulator.tests.vector.model;
+
+import org.junit.Test;
+
+import java.util.List;
+
+import static org.junit.Assert.assertEquals;
+
+public class TestDatasetDiffblueTest {
+    @Test
+    public void testGetPrecisionV1_score100() {
+        var actual = (
+                new TestDataset(
+                        new float[][]{new float[]{0f}},
+                        new int[][]{new int[]{1, 2, 3, 4}},
+                        new float[][]{new float[]{0f}}
+                )
+        ).getPrecisionV1(List.of(1, 2), 0, 2);
+        assertEquals(1, actual, 0.0f);
+    }
+
+    @Test
+    public void testGetPrecisionV1_score0() {
+        assertEquals(0.0f,
+                (
+                        new TestDataset(
+                                new float[][]{new float[]{0f}},
+                                new int[][]{new int[]{1, 2, 3, 4}, new int[]{1, 2, 1, 2}},
+                                new float[][]{new float[]{0f}}
+                        )
+                ).getPrecisionV1(List.of(2), 0, 1),
+                0.0f);
+    }
+
+    @Test
+    public void testGetPrecisionV1_score50() {
+        assertEquals(0.5f,
+                (
+                        new TestDataset(
+                                new float[][]{new float[]{0f}},
+                                new int[][]{new int[]{1, 2, 3, 4}, new int[]{2, 5, 6}},
+                                new float[][]{new float[]{0f}}
+                        )
+                ).getPrecisionV1(List.of(2, 6), 0, 2),
+                0.1f);
+    }
+
+    @Test
+    public void testGetPrecisionV2_theSameVector() {
+        var actual = (
+                new TestDataset(
+                        new float[][]{new float[]{0f}},
+                        new int[][]{new int[]{1, 2, 3, 4}},
+                        new float[][]{new float[]{10.0f, 0.5f, 10.0f, 0.5f}}
+                )
+        ).getPrecisionV2(new float[]{10.0f, 0.5f, 10.0f, 0.5f}, 0);
+        assertEquals(0, actual, 0.0f);
+    }
+
+    @Test
+    public void testGetPrecisionV2_DifferentVector() {
+        var actual = (
+                new TestDataset(
+                        new float[][]{new float[]{0f}},
+                        new int[][]{new int[]{1, 2, 3, 4}},
+                        new float[][]{new float[]{10.0f, 0.5f, 10.0f, 0.5f}}
+                )
+        ).getPrecisionV2(new float[]{8.0f, 0.4f, 9.0f, 0.5f}, 0);
+        assertEquals(Math.sqrt(5.01), actual, 0.1f);
+    }
+}
diff --git a/java/drivers/driver-hazelcast4plus/src/test/java/com/hazelcast/simulator/utils/DatasetReaderTest.java b/java/drivers/driver-hazelcast4plus/src/test/java/com/hazelcast/simulator/utils/DatasetReaderTest.java
index f4636b7a3b..edca8e00b0 100644
--- a/java/drivers/driver-hazelcast4plus/src/test/java/com/hazelcast/simulator/utils/DatasetReaderTest.java
+++ b/java/drivers/driver-hazelcast4plus/src/test/java/com/hazelcast/simulator/utils/DatasetReaderTest.java
@@ -25,7 +25,7 @@ public void npyArchive() {
         assertEquals(1536, reader.getTestDatasetDimension());
 
         assertNotNull(reader.getTrainVector(1234));
-        assertNotNull(reader.getTestDataset());
+        assertEquals(5000, reader.getTestDataset().size());
     }
 
     @Test
@@ -39,6 +39,6 @@ public void hdf5() {
         assertEquals(960, reader.getTestDatasetDimension());
 
         assertNotNull(reader.getTrainVector(1234));
-        assertNotNull(reader.getTestDataset());
+        assertEquals(1000, reader.getTestDataset().size());
     }
 }

From 6c8ed0fb3e375bea617cea97df18c18de59c37dc Mon Sep 17 00:00:00 2001
From: oshultseva
Date: Fri, 17 May 2024 16:20:09 +0100
Subject: [PATCH 3/8] vector db test

---
 .../simulator/tests/vector/DatasetReader.java |  13 +-
 .../vector/VectorCollectionFromUrlTest.java   | 151 ------------------
 .../VectorCollectionPutDatasetTest.java       |  84 ++++++++--
 .../vector/VectorCollectionRandomTest.java    | 145 -----------------
 .../VectorCollectionSearchDatasetTest.java    |   7 +-
 .../simulator/tests/vector/VectorUtils.java   |  11 ++
 .../vector/readers/HDF5DatasetReader.java     |  16 +-
 .../readers/NpyArchiveDatasetReader.java      |  26 +--
 .../simulator/utils/DatasetReaderTest.java    |  30 +++-
 9 files changed, 148 insertions(+), 335 deletions(-)
 delete mode 100644
java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/VectorCollectionFromUrlTest.java delete mode 100644 java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/VectorCollectionRandomTest.java diff --git a/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/DatasetReader.java b/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/DatasetReader.java index 5b37bdfd5b..9beff2c095 100644 --- a/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/DatasetReader.java +++ b/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/DatasetReader.java @@ -39,13 +39,19 @@ public abstract class DatasetReader { protected int size; + protected Boolean normalizeVector = false; + protected final Logger logger = LogManager.getLogger(getClass()); public DatasetReader(String url, String directory) { + this(url, directory, false); + } + public DatasetReader(String url, String directory, Boolean normalizeVector) { try { this.datasetURL = URI.create(url).toURL(); this.workingDirectory = Path.of(directory, FilenameUtils.getBaseName(datasetURL.getFile())); this.downloadedFile = Path.of(workingDirectory.toString(), FilenameUtils.getName(datasetURL.getFile())).toFile(); + this.normalizeVector = normalizeVector; logger.info("Start downloading file from {}", datasetURL); if (!downloadedFile.exists()) { @@ -119,18 +125,17 @@ public Void handleResponse(HttpResponse response) throws IOException { } } - public static DatasetReader create(String url, String directory) { + public static DatasetReader create(String url, String directory, boolean normalizeVector) { try { URL datasetUrl = URI.create(url).toURL(); var ext = FilenameUtils.getExtension(datasetUrl.getFile()); return switch (ext) { - case "hdf5" -> new HDF5DatasetReader(url, directory); - case "tgz" -> new NpyArchiveDatasetReader(url, directory); + case "hdf5" -> new HDF5DatasetReader(url, directory, normalizeVector); + case "tgz" -> new NpyArchiveDatasetReader(url, directory, normalizeVector); default -> throw new UnsupportedOperationException("File " + ext + " is not supported"); }; } catch (MalformedURLException e) { throw new RuntimeException(e); } - } } diff --git a/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/VectorCollectionFromUrlTest.java b/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/VectorCollectionFromUrlTest.java deleted file mode 100644 index ac73a73e44..0000000000 --- a/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/VectorCollectionFromUrlTest.java +++ /dev/null @@ -1,151 +0,0 @@ -package com.hazelcast.simulator.tests.vector; - -import com.hazelcast.config.vector.Metric; -import com.hazelcast.config.vector.VectorCollectionConfig; -import com.hazelcast.config.vector.VectorIndexConfig; -import com.hazelcast.simulator.hz.HazelcastTest; -import com.hazelcast.simulator.test.BaseThreadState; -import com.hazelcast.simulator.test.annotations.AfterRun; -import com.hazelcast.simulator.test.annotations.Setup; -import com.hazelcast.simulator.test.annotations.TimeStep; -import com.hazelcast.vector.SearchOptions; -import com.hazelcast.vector.SearchOptionsBuilder; -import com.hazelcast.vector.VectorCollection; -import com.hazelcast.vector.VectorDocument; -import com.hazelcast.vector.VectorValues; - -import java.util.List; -import 
java.util.concurrent.CompletableFuture; -import java.util.concurrent.TimeUnit; -import java.util.concurrent.atomic.AtomicInteger; - -import static org.junit.Assert.assertEquals; - -@Deprecated -public class VectorCollectionFromUrlTest extends HazelcastTest { - - public String datasetUrl; - - public String workingDirectory; - - // common parameters - public int numberOfSearchIterations = Integer.MAX_VALUE; - - public int loadFirst = Integer.MAX_VALUE; - - // graph parameters - public String metric = "COSINE"; - - public int maxDegree = 40; - - public int efConstruction = 50; - - // search parameters - - public int limit = 1; - - // inner test parameters - - private static final String collectionName = "performance-collection"; - - private static final int PUT_BATCH_SIZE = 10_000; - private VectorCollection collection; - - private final AtomicInteger counter = new AtomicInteger(0); - private List query; - - @Setup - public void setup() { - NpyDatasetReader reader = new NpyDatasetReader(datasetUrl, workingDirectory); - var size = Math.min(reader.getSize(), loadFirst); - int dimension = reader.getDimension(); - assert dimension == reader.getQueryDimension() : "dataset dimension does not correspond to query vector dimension"; - query = reader.getTestCases(); - numberOfSearchIterations = Math.min(numberOfSearchIterations, query.size()); - - collection = VectorCollection.getCollection( - targetInstance, - new VectorCollectionConfig(collectionName) - .addVectorIndexConfig( - new VectorIndexConfig() - .setMetric(Metric.valueOf(metric)) - .setDimension(dimension) - .setMaxDegree(maxDegree) - .setEfConstruction(efConstruction) - ) - ); - - var start = System.currentTimeMillis(); - for (int i = 0; i < size; i += PUT_BATCH_SIZE) { - logger.info( - String.format( - "Uploaded %s vectors from %s. 
Time: %s", - i, - size, - TimeUnit.MILLISECONDS.toMinutes(System.currentTimeMillis() - start) - ) - ); - int currentBatchLength = Math.min(PUT_BATCH_SIZE, size - i); - int[] keys = new int[currentBatchLength]; - float[][] vectors = new float[currentBatchLength][]; - for (int j = 0; j < currentBatchLength; j++) { - keys[j] = i + j; - vectors[j] = reader.read(i + j); - } - putBatchSync(keys, vectors); - } - logger.info("Collection size: " + size); - logger.info("Collection dimension: " + reader.getDimension()); - logger.info("Index build time(m): " + TimeUnit.MILLISECONDS.toMinutes(System.currentTimeMillis() - start)); - } - - @TimeStep(prob = 0) - public void search(ThreadState state) { - var iteration = counter.incrementAndGet(); - if (iteration >= numberOfSearchIterations) { - testContext.stop(); - return; - } - var vector = query.get(iteration); - SearchOptions options = new SearchOptionsBuilder().vector(vector).includePayload().limit(limit).build(); - var result = collection.searchAsync(options).toCompletableFuture().join(); - assertEquals(limit, result.size()); - var score = result.results().next().getScore(); - ScoreMetrics.set((int) (score * 100)); - logger.info("Found score: " + result.results().next().getScore()); - } - - @AfterRun - public void afterRun() { - logger.info("Number of search iteration: " + counter.get()); - logger.info("Min score: " + ScoreMetrics.getMin()); - logger.info("Max score: " + ScoreMetrics.getMax()); - logger.info("Mean score: " + ScoreMetrics.getMean()); - logger.info("Percent lower then 0.98: " + ScoreMetrics.getPercentLowerThen(98)); - } - - private void putSync(int key, float[] vector) { - collection.putAsync( - key, - VectorDocument.of(key, VectorValues.of(vector)) - ) - .toCompletableFuture() - .join(); - } - - private void putBatchSync(int[] keys, float[][] vectors) { - CompletableFuture[] futures = new CompletableFuture[keys.length]; - for (int i = 0; i < keys.length; i++) { - futures[i] = collection.putAsync( - keys[i], - VectorDocument.of(keys[i], VectorValues.of(vectors[i])) - ) - .toCompletableFuture(); - } - CompletableFuture.allOf(futures).join(); - } - - - public static class ThreadState extends BaseThreadState { - } -} diff --git a/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/VectorCollectionPutDatasetTest.java b/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/VectorCollectionPutDatasetTest.java index d2c64a4b33..bc64d86618 100644 --- a/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/VectorCollectionPutDatasetTest.java +++ b/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/VectorCollectionPutDatasetTest.java @@ -11,8 +11,11 @@ import com.hazelcast.vector.VectorCollection; import com.hazelcast.vector.VectorDocument; import com.hazelcast.vector.VectorValues; +import org.HdrHistogram.Histogram; +import java.util.ArrayList; import java.util.HashMap; +import java.util.List; import java.util.Map; import java.util.concurrent.atomic.AtomicInteger; @@ -37,15 +40,18 @@ public class VectorCollectionPutDatasetTest extends HazelcastTest { // inner test parameters private static final String collectionName = "performance-collection"; + + private static final TimeMetrics metrics = new TimeMetrics(); private VectorCollection collection; private final AtomicInteger counter = new AtomicInteger(0); private DatasetReader reader; + private List>> buffers = new ArrayList<>(); @Setup public void setup() { - 
reader = DatasetReader.create(datasetUrl, workingDirectory); + reader = DatasetReader.create(datasetUrl, workingDirectory, false); int dimension = reader.getDimension(); loadFirst = Math.min(loadFirst, reader.getSize()); @@ -64,21 +70,23 @@ public void setup() { @TimeStep(prob = 0) public void put(ThreadState state) { - var iteration = counter.incrementAndGet(); + var iteration = counter.getAndIncrement(); if (iteration >= loadFirst) { testContext.stop(); return; } var vector = reader.getTrainVector(iteration); - collection.putAsync( - iteration, - VectorDocument.of(iteration, VectorValues.of(vector)) - ) - .toCompletableFuture() - .join(); + metrics.recordPut( + () -> collection.putAsync( + iteration, + VectorDocument.of(iteration, VectorValues.of(vector)) + ) + .toCompletableFuture() + .join() + ); } - @TimeStep(prob = 0) + @TimeStep(prob = 1) public void putAll(ThreadState state) { var iteration = counter.getAndAdd(putBatchSize); if (iteration >= loadFirst) { @@ -86,20 +94,62 @@ public void putAll(ThreadState state) { return; } Map> buffer = new HashMap<>(); - for (int i = 0; i < putBatchSize; i++) { - var key = iteration + i; - var vector = reader.getTrainVector(key); - buffer.put(key, VectorDocument.of(key, VectorValues.of(vector))); - } - collection.putAllAsync(buffer) - .toCompletableFuture() - .join(); + metrics.recordBuffer( + () -> { + for (int i = 0; i < putBatchSize; i++) { + var key = iteration + i; + if (key >= reader.size) { + break; + } + var vector = reader.getTrainVector(key); + buffer.put(key, VectorDocument.of(key, VectorValues.of(vector))); + } + } + ); + + metrics.recordPut( + () -> collection.putAllAsync(buffer) + .toCompletableFuture() + .join() + ); } @AfterRun public void afterRun() { + logger.info("****CUSTOM STATISTICS****"); + logger.info(metrics.getStatistics()); } public static class ThreadState extends BaseThreadState { } + + + public static class TimeMetrics { + private static final Histogram bufferTimer = new Histogram(2); + private static final Histogram putTimer = new Histogram(2); + + + public void recordBuffer(Runnable action) { + var start = System.currentTimeMillis(); + action.run(); + bufferTimer.recordValue(System.currentTimeMillis() - start); + } + + public void recordPut(Runnable action) { + var start = System.currentTimeMillis(); + action.run(); + putTimer.recordValue(System.currentTimeMillis() - start); + } + + public String getStatistics() { + return "\nBuffer 95p: " + bufferTimer.getValueAtPercentile(95) + "\n" + + "Buffer max: " + bufferTimer.getMaxValue() + "\n" + + "Buffer min: " + bufferTimer.getMinValue() + "\n" + + "Buffer mean: " + bufferTimer.getMean() + "\n" + + "Put 95p: " + putTimer.getValueAtPercentile(95) + "\n" + + "Put max: " + putTimer.getMaxValue() + "\n" + + "Put min: " + putTimer.getMinValue() + "\n" + + "Put mean: " + putTimer.getMean() + "\n"; + } + } } diff --git a/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/VectorCollectionRandomTest.java b/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/VectorCollectionRandomTest.java deleted file mode 100644 index ece9ea1731..0000000000 --- a/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/VectorCollectionRandomTest.java +++ /dev/null @@ -1,145 +0,0 @@ -package com.hazelcast.simulator.tests.vector; - -import com.hazelcast.config.vector.Metric; -import com.hazelcast.config.vector.VectorCollectionConfig; -import com.hazelcast.config.vector.VectorIndexConfig; -import 
com.hazelcast.simulator.hz.HazelcastTest; -import com.hazelcast.simulator.test.BaseThreadState; -import com.hazelcast.simulator.test.annotations.Setup; -import com.hazelcast.simulator.test.annotations.TimeStep; -import com.hazelcast.simulator.test.annotations.Verify; -import com.hazelcast.vector.SearchOptions; -import com.hazelcast.vector.SearchOptionsBuilder; -import com.hazelcast.vector.VectorCollection; -import com.hazelcast.vector.VectorDocument; -import com.hazelcast.vector.VectorValues; -import com.hazelcast.vector.impl.SingleIndexVectorValues; - -import java.util.Arrays; -import java.util.Random; -import java.util.concurrent.atomic.AtomicInteger; -import java.util.function.Supplier; - -import static java.util.stream.IntStream.range; -import static org.junit.Assert.assertEquals; -import static org.junit.Assert.assertNotNull; - -@Deprecated -public class VectorCollectionRandomTest extends HazelcastTest { - // common parameters - public int numberOfIterations = 0; - public int initialCollectionSize = 0; - - public int minCoordinate = 0; - - public int maxCoordinate = 100; - - public boolean generateOnlyIntCoordinate = false; - - // graph parameters - public int dimension = 10; - - public String metric = "EUCLIDEAN"; - - public int maxDegree = 40; - - public int efConstruction = 50; - - // search parameters - - public int limit = 100; - - - // inner test parameters - - private final String collectionName = "performance-collection"; - private VectorCollection collection; - - private final AtomicInteger counter = new AtomicInteger(initialCollectionSize); - - private final Random RANDOM = new Random(); - private long start; - - @Setup - public void setup() { - collection = VectorCollection.getCollection( - targetInstance, - new VectorCollectionConfig(collectionName) - .addVectorIndexConfig( - new VectorIndexConfig() - .setMetric(Metric.valueOf(metric)) - .setDimension(dimension) - .setMaxDegree(maxDegree) - .setEfConstruction(efConstruction) - ) - ); - - for (int i = 0; i < initialCollectionSize; i++) { - generateVectorAndPutSync(i); - } - start = System.currentTimeMillis(); - logger.info("Generated collection size: " + initialCollectionSize); - logger.info("Start time: " + start); - - } - - @TimeStep - public void put(ThreadState state) { - var key = counter.incrementAndGet(); - if(numberOfIterations > 0 && key >= numberOfIterations - initialCollectionSize) { - testContext.stop(); - return; - } - generateVectorAndPutSync(key); - } - - @TimeStep(prob = 0) - public void search(ThreadState state) { - var iteration = counter.incrementAndGet(); - if(numberOfIterations > 0 && iteration >= numberOfIterations) { - testContext.stop(); - return; - } - var vector = generateVector(dimension, 0, maxCoordinate); - SearchOptions options = new SearchOptionsBuilder().vector(vector).includePayload().limit(limit).build(); - var result = collection.searchAsync(options).toCompletableFuture().join(); - assertEquals(limit, result.size()); - } - - private void generateVectorAndPutSync(int key) { - collection.putAsync( - key, - VectorDocument.of(key, VectorValues.of(generateVector(dimension, minCoordinate, maxCoordinate))) - ) - .toCompletableFuture() - .join(); - } - - - @Verify - public void verify() { - logger.info("======== VERIFYING VECTOR COLLECTION ========="); - logger.info("Collection size is: " + counter.get()); - logger.info("Test duration (minutes): " + (System.currentTimeMillis() - start)/60_000); - int verifySize = counter.get() / 100; - int[] verifyKeys = RANDOM.ints(verifySize, 0, 
counter.get()).toArray(); - for (int key : verifyKeys) { - VectorDocument value = collection.getAsync(key).toCompletableFuture().join(); - assertNotNull("No value found for the key: " + key, value); - var vector = ((SingleIndexVectorValues)value.getVectors()).vector(); - logger.info(key + " - " + Arrays.toString(vector)); - } - } - - public class ThreadState extends BaseThreadState { - } - - private float[] generateVector(int length, int minValue, int maxValue) { - assert minValue >= 0 : "negative min value is not supported"; - Supplier generator = generateOnlyIntCoordinate ? () -> (float) RANDOM.nextInt(minValue, maxValue) : () -> RANDOM.nextFloat(minValue, maxValue); - float[] value = new float[length]; - range(0, length).forEach(i -> value[i] = generator.get()); - return value; - } - -} diff --git a/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/VectorCollectionSearchDatasetTest.java b/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/VectorCollectionSearchDatasetTest.java index b9865a1fad..6aa85a5060 100644 --- a/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/VectorCollectionSearchDatasetTest.java +++ b/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/VectorCollectionSearchDatasetTest.java @@ -50,6 +50,8 @@ public class VectorCollectionSearchDatasetTest extends HazelcastTest { public int efConstruction = 50; + public boolean normalize = false; + // search parameters public int limit = 1; @@ -58,7 +60,7 @@ public class VectorCollectionSearchDatasetTest extends HazelcastTest { private static final String collectionName = "performance-collection"; - private static final int PUT_BATCH_SIZE = 10_000; + private static final int PUT_BATCH_SIZE = 2_000; private static final int MAX_PUT_ALL_IN_FLIGHT = 5; @@ -74,7 +76,8 @@ public class VectorCollectionSearchDatasetTest extends HazelcastTest { @Setup public void setup() { - DatasetReader reader = DatasetReader.create(datasetUrl, workingDirectory); + DatasetReader reader = DatasetReader.create(datasetUrl, workingDirectory, normalize); + logger.info("Use normalize: {}", normalize); var size = Math.min(reader.getSize(), loadFirst); int dimension = reader.getDimension(); assert dimension == reader.getTestDatasetDimension() : "dataset dimension does not correspond to query vector dimension"; diff --git a/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/VectorUtils.java b/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/VectorUtils.java index 84e42dcee3..3f9a76941a 100644 --- a/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/VectorUtils.java +++ b/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/VectorUtils.java @@ -22,4 +22,15 @@ public static Function restoreRealMetric(Metric metric) { default -> jMetric -> -1f; }; } + + public static void normalize(float[] vector) { + double length = 0; + for (float v : vector) { + length += v * v; + } + var scale = (float) (1 / Math.sqrt(length)); + for (int i = 0; i < vector.length; i++) { + vector[i] *= scale; + } + } } diff --git a/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/readers/HDF5DatasetReader.java b/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/readers/HDF5DatasetReader.java index d33da74efc..f103873e10 100644 --- 
a/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/readers/HDF5DatasetReader.java +++ b/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/readers/HDF5DatasetReader.java @@ -1,6 +1,7 @@ package com.hazelcast.simulator.tests.vector.readers; import com.hazelcast.simulator.tests.vector.DatasetReader; +import com.hazelcast.simulator.tests.vector.VectorUtils; import com.hazelcast.simulator.tests.vector.model.TestDataset; import io.jhdf.HdfFile; import io.jhdf.api.Dataset; @@ -9,8 +10,8 @@ public class HDF5DatasetReader extends DatasetReader { private static final int BULK_READER_SIZE = 50_000; - public HDF5DatasetReader(String url, String directory) { - super(url, directory); + public HDF5DatasetReader(String url, String directory, Boolean normalizeVector) { + super(url, directory, normalizeVector); } @Override @@ -23,15 +24,24 @@ protected void parseTrainDataset() { trainDataset = getDatasetAsFloatMatrix("train"); size = trainDataset.length; dimension = trainDataset[0].length; + if (normalizeVector) { + for (float[] vector : trainDataset) { + VectorUtils.normalize(vector); + } + } } @Override protected void parseTestDataset() { var searchVectors = getDatasetAsFloatMatrix("test"); - // todo - now scores in reversed order var ids = getDatasetAsIntMatrix("neighbors"); var scores = getDatasetAsFloatMatrix("distances"); testDataset = new TestDataset(searchVectors, ids, scores); + if (normalizeVector) { + for (float[] vector : searchVectors) { + VectorUtils.normalize(vector); + } + } } private float[][] getDatasetAsFloatMatrix(String datasetName) { diff --git a/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/readers/NpyArchiveDatasetReader.java b/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/readers/NpyArchiveDatasetReader.java index b0a3f19819..a0f01caca8 100644 --- a/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/readers/NpyArchiveDatasetReader.java +++ b/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/readers/NpyArchiveDatasetReader.java @@ -1,8 +1,10 @@ package com.hazelcast.simulator.tests.vector.readers; +import com.google.common.collect.Lists; import com.google.gson.JsonArray; import com.google.gson.JsonParser; import com.hazelcast.simulator.tests.vector.DatasetReader; +import com.hazelcast.simulator.tests.vector.VectorUtils; import com.hazelcast.simulator.tests.vector.model.TestDataset; import org.apache.commons.io.FileUtils; import org.jetbrains.bio.npy.NpyArray; @@ -20,10 +22,8 @@ public class NpyArchiveDatasetReader extends DatasetReader { private Path trainDatasetFilename; private Path testDatesetFilename; - private float[] trainDatasetPlain; - - public NpyArchiveDatasetReader(String url, String directory) { - super(url, directory); + public NpyArchiveDatasetReader(String url, String directory, boolean normalizeVector) { + super(url, directory, normalizeVector); } @Override @@ -51,7 +51,15 @@ protected void parseTrainDataset() { var shape = read.getShape(); size = shape[0]; dimension = shape[1]; - trainDatasetPlain = read.asFloatArray(); + trainDataset = new float[size][]; + var trainDatasetPlain = read.asFloatArray(); + for (int i = 0; i < size; i++) { + var vector = getTrainVectorPlain(i, trainDatasetPlain); + if (normalizeVector) { + VectorUtils.normalize(vector); + } + trainDataset[i] = vector; + } } @Override @@ -69,9 +77,11 @@ protected void 
parseTestDataset() { var ids = queryObject.getAsJsonArray("closest_ids"); var scores = queryObject.getAsJsonArray("closest_scores"); searchVectors[i] = convertToFloatArray(jsonArray); + if (normalizeVector) { + VectorUtils.normalize(searchVectors[i]); + } searchClosestIds[i] = convertToIntArray(ids); searchClosestScore[i] = convertToFloatArray(scores); - } testDataset = new TestDataset(searchVectors, searchClosestIds, searchClosestScore); } catch (IOException e) { @@ -80,7 +90,6 @@ protected void parseTestDataset() { } - private float[] convertToFloatArray(JsonArray array) { var result = new float[array.size()]; for (int i = 0; i < array.size(); i++) { @@ -97,8 +106,7 @@ private int[] convertToIntArray(JsonArray array) { return result; } - @Override - public float[] getTrainVector(int index) { + private float[] getTrainVectorPlain(int index, float[] trainDatasetPlain) { if (index >= size) { throw new RuntimeException("invalid index"); } diff --git a/java/drivers/driver-hazelcast4plus/src/test/java/com/hazelcast/simulator/utils/DatasetReaderTest.java b/java/drivers/driver-hazelcast4plus/src/test/java/com/hazelcast/simulator/utils/DatasetReaderTest.java index edca8e00b0..77f8be0008 100644 --- a/java/drivers/driver-hazelcast4plus/src/test/java/com/hazelcast/simulator/utils/DatasetReaderTest.java +++ b/java/drivers/driver-hazelcast4plus/src/test/java/com/hazelcast/simulator/utils/DatasetReaderTest.java @@ -3,6 +3,8 @@ import com.hazelcast.simulator.tests.vector.DatasetReader; import org.junit.Test; +import java.util.Arrays; + import static org.junit.Assert.assertEquals; import static org.junit.Assert.assertNotNull; @@ -15,10 +17,10 @@ public class DatasetReaderTest { String workingDirectory = "/Users/oshultseva/Downloads/dataset_output/"; @Test - public void npyArchive() { + public void npy_dbpedia() { String url = "https://storage.googleapis.com/ann-filtered-benchmark/datasets/dbpedia_openai_1M.tgz"; - DatasetReader reader = DatasetReader.create(url, workingDirectory); + DatasetReader reader = DatasetReader.create(url, workingDirectory, true); assertEquals(975_000, reader.getSize()); assertEquals(1536, reader.getDimension()); @@ -26,13 +28,33 @@ public void npyArchive() { assertNotNull(reader.getTrainVector(1234)); assertEquals(5000, reader.getTestDataset().size()); + + assertEquals(0.01739898, reader.getTrainVector(0)[0], 0.0001); + assertEquals(-0.04525524, reader.getTrainVector(0)[1535], 0.0001); } @Test - public void hdf5() { + public void hdf5_angular() { + String url = "http://ann-benchmarks.com/glove-100-angular.hdf5"; + + DatasetReader reader = DatasetReader.create(url, workingDirectory, true); + + assertEquals(1_183_514, reader.getSize()); + assertEquals(100, reader.getDimension()); + assertEquals(100, reader.getTestDatasetDimension()); + + assertEquals(10_000, reader.getTestDataset().size()); + assertEquals(-0.02701984, reader.getTrainVector(0)[0], 0.0001); + assertEquals(-0.00503204, reader.getTrainVector(0)[99], 0.0001); + + assertEquals(0.08828659, reader.getTestDataset().getSearchVector(0)[0], 0.0001); + assertEquals(-0.0329303, reader.getTestDataset().getSearchVector(0)[99], 0.0001); + } + @Test + public void hdf5_960_euclidean() { String url = "http://ann-benchmarks.com/gist-960-euclidean.hdf5"; - DatasetReader reader = DatasetReader.create(url, workingDirectory); + DatasetReader reader = DatasetReader.create(url, workingDirectory, false); assertEquals(1_000_000, reader.getSize()); assertEquals(960, reader.getDimension()); From a1fc7897e2068e7e4270be9122a882291ae8a22c Mon Sep 
17 00:00:00 2001 From: oshultseva Date: Mon, 20 May 2024 15:26:07 +0100 Subject: [PATCH 4/8] vector db test --- .../simulator/tests/vector/ScoreMetrics.java | 25 +++- .../VectorCollectionSearchDatasetTest.java | 111 ++++++++++++------ .../simulator/tests/vector/VectorUtils.java | 2 +- .../tests/vector/model/TestDataset.java | 15 +-- .../vector/model/TestDatasetDiffblueTest.java | 36 +----- 5 files changed, 104 insertions(+), 85 deletions(-) diff --git a/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/ScoreMetrics.java b/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/ScoreMetrics.java index 7919e99d36..cca32c94b4 100644 --- a/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/ScoreMetrics.java +++ b/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/ScoreMetrics.java @@ -4,26 +4,39 @@ public class ScoreMetrics { - private static final Histogram scoreHistogram = new Histogram(100, 3); + private String name; - public static void set(int score) { + private final Histogram scoreHistogram = new Histogram(100, 3); + + public ScoreMetrics() { + } + + public void set(int score) { scoreHistogram.recordValue(score); } - public static long getMin() { + public long getMin() { return scoreHistogram.getMinValue(); } - public static long getMax() { + public long getMax() { return scoreHistogram.getMaxValue(); } - public static double getMean() { + public double getMean() { return scoreHistogram.getMean(); } - public static long getPercentLowerThen(int value) { + public double getPercentile(double value) { + return scoreHistogram.getValueAtPercentile(value); + } + + public long getPercentLowerThen(int value) { var lower = scoreHistogram.getCountBetweenValues(0, value); return (lower * 100) / scoreHistogram.getTotalCount(); } + + public void setName(String name) { + this.name = name; + } } diff --git a/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/VectorCollectionSearchDatasetTest.java b/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/VectorCollectionSearchDatasetTest.java index 6aa85a5060..722ccd4cf7 100644 --- a/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/VectorCollectionSearchDatasetTest.java +++ b/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/VectorCollectionSearchDatasetTest.java @@ -27,13 +27,16 @@ import java.util.Queue; import java.util.concurrent.CompletionStage; import java.util.concurrent.ConcurrentLinkedQueue; -import java.util.concurrent.atomic.AtomicInteger; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.locks.ReentrantLock; import java.util.function.Function; import static java.util.concurrent.TimeUnit.MILLISECONDS; public class VectorCollectionSearchDatasetTest extends HazelcastTest { + public String name; + public String datasetUrl; public String workingDirectory; @@ -58,35 +61,48 @@ public class VectorCollectionSearchDatasetTest extends HazelcastTest { // inner test parameters - private static final String collectionName = "performance-collection"; - private static final int PUT_BATCH_SIZE = 2_000; private static final int MAX_PUT_ALL_IN_FLIGHT = 5; private VectorCollection collection; - private final AtomicInteger searchCounter = new AtomicInteger(0); - - private final AtomicInteger putCounter = new AtomicInteger(0); - private TestDataset testDataset; private 
final Queue searchResults = new ConcurrentLinkedQueue<>(); + private final ScoreMetrics scoreMetrics = new ScoreMetrics(); + + private long indexBuildTime = 0; + + private final ReentrantLock lock = new ReentrantLock(); + + private final CountDownLatch setupDone = new CountDownLatch(1); + @Setup public void setup() { + if (!lock.tryLock()) { + try { + setupDone.await(); + } catch (InterruptedException e) { + throw new RuntimeException(e); + } + return; + } + // only one thread perform the setup + scoreMetrics.setName(name); DatasetReader reader = DatasetReader.create(datasetUrl, workingDirectory, normalize); - logger.info("Use normalize: {}", normalize); var size = Math.min(reader.getSize(), loadFirst); int dimension = reader.getDimension(); assert dimension == reader.getTestDatasetDimension() : "dataset dimension does not correspond to query vector dimension"; testDataset = reader.getTestDataset(); numberOfSearchIterations = Math.min(numberOfSearchIterations, testDataset.size()); + logger.info("Vector collection name: {}", name); + logger.info("Use normalize: {}", normalize); collection = VectorCollection.getCollection( targetInstance, - new VectorCollectionConfig(collectionName) + new VectorCollectionConfig(name) .addVectorIndexConfig( new VectorIndexConfig() .setMetric(Metric.valueOf(metric)) @@ -96,15 +112,16 @@ public void setup() { ) ); - var start = System.currentTimeMillis(); + var indexBuildTimeStart = System.currentTimeMillis(); Map> buffer = new HashMap<>(); - int index; Pipelining pipelining = new Pipelining<>(MAX_PUT_ALL_IN_FLIGHT); logger.info("Start loading data..."); - while ((index = putCounter.getAndIncrement()) < size) { + int index = 0; + while (index < size) { buffer.put(index, VectorDocument.of(index, VectorValues.of(reader.getTrainVector(index)))); + index++; if (buffer.size() % PUT_BATCH_SIZE == 0) { addToPipelineWithLogging(pipelining, collection.putAllAsync(buffer)); logger.info( @@ -112,7 +129,7 @@ public void setup() { index, size, buffer.size(), - MILLISECONDS.toMinutes(System.currentTimeMillis() - start) + MILLISECONDS.toMinutes(System.currentTimeMillis() - indexBuildTimeStart) ); buffer = new HashMap<>(); } @@ -124,26 +141,26 @@ public void setup() { } logger.info("Start waiting pipeline results..."); - var pipelineWaiting = withTimer(() -> { - try { - pipelining.results(); - } catch (Exception e) { - throw new RuntimeException(e); - } - }); - logger.info("Pipeline waiting finished in {} min", MILLISECONDS.toMinutes(pipelineWaiting)); + try { + pipelining.results(); + } catch (Exception e) { + throw new RuntimeException(e); + } var cleanupTimer = withTimer(() -> collection.optimizeAsync().toCompletableFuture().join()); + indexBuildTime = System.currentTimeMillis() - indexBuildTimeStart; logger.info("Collection size: {}", size); logger.info("Collection dimension: {}", reader.getDimension()); logger.info("Cleanup time (min): {}", MILLISECONDS.toMinutes(cleanupTimer)); - logger.info("Index build time (min): {}", MILLISECONDS.toMinutes(System.currentTimeMillis() - start)); + logger.info("Index build time (min): {}", MILLISECONDS.toMinutes(indexBuildTime)); + + setupDone.countDown(); } @TimeStep() public void search(ThreadState state) { - var iteration = searchCounter.getAndIncrement(); + var iteration = state.getAndIncrementIteration(); if (iteration >= numberOfSearchIterations) { testContext.stop(); return; @@ -160,24 +177,36 @@ public void afterRun() { int index = testSearchResult.index(); List ids = new ArrayList<>(); 
VectorUtils.forEach(testSearchResult.results, r -> ids.add((Integer) r.getKey())); - ScoreMetrics.set((int) (testDataset.getPrecisionV1(ids, index, limit) * 100)); + scoreMetrics.set((int) (testDataset.getPrecision(ids, index, limit) * 100)); }); - writePureResultsToFile("precision.out"); - logger.info("Number of search iteration: {}", searchCounter.get()); - logger.info("Min score: {}", ScoreMetrics.getMin()); - logger.info("Max score: {}", ScoreMetrics.getMax()); - logger.info("Mean score: {}", ScoreMetrics.getMean()); - logger.info("Percent of results lower then 98% precision: {}", ScoreMetrics.getPercentLowerThen(98)); + writeAllSearchResultsToFile("precision_" + name + ".out"); + appendStatisticsToFile("statistics.out"); + logger.info("Results for {}", name); + logger.info("Min score: {}", scoreMetrics.getMin()); + logger.info("Max score: {}", scoreMetrics.getMax()); + logger.info("Mean score: {}", scoreMetrics.getMean()); + logger.info("5pt: {}", scoreMetrics.getPercentile(5)); + logger.info("10pt: {}", scoreMetrics.getPercentile(10)); + logger.info("The percentage of results with precision lower than 98%: {}", scoreMetrics.getPercentLowerThen(98)); + logger.info("The percentage of results with precision lower than 99%: {}", scoreMetrics.getPercentLowerThen(99)); } public static class ThreadState extends BaseThreadState { + + private int iteration = 0; + + public int getAndIncrementIteration() { + var it = iteration; + iteration++; + return it; + } } public record TestSearchResult(int index, float[] searchVector, SearchResults results) { } - private void writePureResultsToFile(String fileName) { + private void writeAllSearchResultsToFile(String fileName) { try { Function restore = VectorUtils.restoreRealMetric(Metric.valueOf(metric)); var fileWriter = new FileWriter(fileName); @@ -203,6 +232,22 @@ private void writePureResultsToFile(String fileName) { } } + private void appendStatisticsToFile(String fileName) { + try { + FileWriter fileWriter = new FileWriter(fileName, true); + PrintWriter printWriter = new PrintWriter(fileWriter); + List values = List.of( + name, + String.valueOf(indexBuildTime), + String.valueOf(scoreMetrics.getMean()) + ); + printWriter.println(String.join(", ", values)); + printWriter.close(); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + void addToPipelineWithLogging(Pipelining pipelining, CompletionStage asyncInvocation) { var now = System.currentTimeMillis(); try { @@ -228,11 +273,9 @@ private long withTimer(Runnable runnable) { private float getFirstCoordinate(VectorValues vectorValues) { var v = (VectorValues.SingleVectorValues) vectorValues; - if(v == null || v.vector().length == 0) { + if (v == null || v.vector().length == 0) { return 0; } return v.vector()[0]; } - - } diff --git a/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/VectorUtils.java b/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/VectorUtils.java index 3f9a76941a..2f6bedafec 100644 --- a/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/VectorUtils.java +++ b/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/VectorUtils.java @@ -18,7 +18,7 @@ public static void forEach(SearchResults searchResults, Consumer c public static Function restoreRealMetric(Metric metric) { return switch (metric) { - case COSINE -> jMetric -> 2 * jMetric - 1; + case COSINE, DOT -> jMetric -> 2 * jMetric - 1; default -> jMetric -> -1f; }; } diff 
--git a/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/model/TestDataset.java b/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/model/TestDataset.java index bcf9066761..0edc7342dd 100644 --- a/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/model/TestDataset.java +++ b/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/model/TestDataset.java @@ -34,23 +34,10 @@ public int size() { return searchVectors.length; } - public float getPrecisionV1(List actualVectorsIds, int index, int top) { + public float getPrecision(List actualVectorsIds, int index, int top) { var actualSet = new HashSet<>(actualVectorsIds); var expectedSet = Arrays.stream(Arrays.copyOfRange(closestIds[index], 0, top)).boxed().collect(Collectors.toSet()); actualSet.retainAll(expectedSet); return ((float) actualSet.size()) / top; } - - public float getPrecisionV2(float[] actualVectorsScore, int index) { - var expected = Arrays.copyOfRange(closestScores[index], 0, actualVectorsScore.length); - return distance(actualVectorsScore, expected); - } - - private float distance(float[] array1, float[] array2) { - double sum = 0f; - for (int i = 0; i < array1.length; i++) { - sum += Math.pow((array1[i] - array2[i]), 2.0); - } - return (float) Math.sqrt(sum); - } } diff --git a/java/drivers/driver-hazelcast4plus/src/test/java/com/hazelcast/simulator/tests/vector/model/TestDatasetDiffblueTest.java b/java/drivers/driver-hazelcast4plus/src/test/java/com/hazelcast/simulator/tests/vector/model/TestDatasetDiffblueTest.java index ec92638fae..71463cfea8 100644 --- a/java/drivers/driver-hazelcast4plus/src/test/java/com/hazelcast/simulator/tests/vector/model/TestDatasetDiffblueTest.java +++ b/java/drivers/driver-hazelcast4plus/src/test/java/com/hazelcast/simulator/tests/vector/model/TestDatasetDiffblueTest.java @@ -8,19 +8,19 @@ public class TestDatasetDiffblueTest { @Test - public void testGetPrecisionV1_score100() { + public void testGetPrecision_score100() { var actual = ( new TestDataset( new float[][]{new float[]{0f}}, new int[][]{new int[]{1, 2, 3, 4}}, new float[][]{new float[]{0f}} ) - ).getPrecisionV1(List.of(1, 2), 0, 2); + ).getPrecision(List.of(1, 2), 0, 2); assertEquals(1, actual, 0.0f); } @Test - public void testGetPrecisionV1_score0() { + public void testGetPrecision_score0() { assertEquals(0.0f, ( new TestDataset( @@ -28,12 +28,12 @@ public void testGetPrecisionV1_score0() { new int[][]{new int[]{1, 2, 3, 4}, new int[]{1, 2, 1, 2}}, new float[][]{new float[]{0f}} ) - ).getPrecisionV1(List.of(2), 0, 1), + ).getPrecision(List.of(2), 0, 1), 0.0f); } @Test - public void testGetPrecisionV1_score50() { + public void testGetPrecision_score50() { assertEquals(0.5f, ( new TestDataset( @@ -41,31 +41,7 @@ public void testGetPrecisionV1_score50() { new int[][]{new int[]{1, 2, 3, 4}, new int[]{2, 5, 6}}, new float[][]{new float[]{0f}} ) - ).getPrecisionV1(List.of(2, 6), 0, 2), + ).getPrecision(List.of(2, 6), 0, 2), 0.1f); } - - @Test - public void testGetPrecisionV2_theSameVector() { - var actual = ( - new TestDataset( - new float[][]{new float[]{0f}}, - new int[][]{new int[]{1, 2, 3, 4}}, - new float[][]{new float[]{10.0f, 0.5f, 10.0f, 0.5f}} - ) - ).getPrecisionV2(new float[]{10.0f, 0.5f, 10.0f, 0.5f}, 0); - assertEquals(0, actual, 0.0f); - } - - @Test - public void testGetPrecisionV2_DiffrentVector() { - var actual = ( - new TestDataset( - new float[][]{new float[]{0f}}, - new int[][]{new 
int[]{1, 2, 3, 4}}, - new float[][]{new float[]{10.0f, 0.5f, 10.0f, 0.5f}} - ) - ).getPrecisionV2(new float[]{8.0f, 0.4f, 9.0f, 0.5f}, 0); - assertEquals(Math.sqrt(5.01), actual, 0.1f); - } } From 119d7c6eb909888818806f12c68186a5ac701a92 Mon Sep 17 00:00:00 2001 From: oshultseva Date: Mon, 17 Jun 2024 10:51:24 +0100 Subject: [PATCH 5/8] vector db test --- java/drivers/driver-hazelcast4plus/pom.xml | 5 ++ .../VectorCollectionPutDatasetTest.java | 14 ++--- .../VectorCollectionSearchDatasetTest.java | 51 +++++++++---------- .../simulator/tests/vector/VectorUtils.java | 2 +- 4 files changed, 36 insertions(+), 36 deletions(-) diff --git a/java/drivers/driver-hazelcast4plus/pom.xml b/java/drivers/driver-hazelcast4plus/pom.xml index f5fd1fe80c..8f8fc5de9d 100644 --- a/java/drivers/driver-hazelcast4plus/pom.xml +++ b/java/drivers/driver-hazelcast4plus/pom.xml @@ -172,6 +172,11 @@ 0.6.10 + + org.jctools + jctools-core + 4.0.3 + diff --git a/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/VectorCollectionPutDatasetTest.java b/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/VectorCollectionPutDatasetTest.java index bc64d86618..c6ab9c8f08 100644 --- a/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/VectorCollectionPutDatasetTest.java +++ b/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/VectorCollectionPutDatasetTest.java @@ -25,25 +25,24 @@ public class VectorCollectionPutDatasetTest extends HazelcastTest { public String workingDirectory; + public String name; + // common parameters public int loadFirst = Integer.MAX_VALUE; public int putBatchSize = 10_000; // graph parameters - public String metric = "COSINE"; + public String metric; - public int maxDegree = 40; + public int maxDegree; - public int efConstruction = 50; + public int efConstruction; // inner test parameters - private static final String collectionName = "performance-collection"; - private static final TimeMetrics metrics = new TimeMetrics(); private VectorCollection collection; - private final AtomicInteger counter = new AtomicInteger(0); private DatasetReader reader; @@ -57,7 +56,7 @@ public void setup() { collection = VectorCollection.getCollection( targetInstance, - new VectorCollectionConfig(collectionName) + new VectorCollectionConfig(name) .addVectorIndexConfig( new VectorIndexConfig() .setMetric(Metric.valueOf(metric)) @@ -66,6 +65,7 @@ public void setup() { .setEfConstruction(efConstruction) ) ); + logger.info("Use collection with name: {}", collection.getName()); } @TimeStep(prob = 0) diff --git a/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/VectorCollectionSearchDatasetTest.java b/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/VectorCollectionSearchDatasetTest.java index 722ccd4cf7..171d393e06 100644 --- a/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/VectorCollectionSearchDatasetTest.java +++ b/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/VectorCollectionSearchDatasetTest.java @@ -7,6 +7,7 @@ import com.hazelcast.simulator.hz.HazelcastTest; import com.hazelcast.simulator.test.BaseThreadState; import com.hazelcast.simulator.test.annotations.AfterRun; +import com.hazelcast.simulator.test.annotations.Prepare; import com.hazelcast.simulator.test.annotations.Setup; import com.hazelcast.simulator.test.annotations.TimeStep; 
import com.hazelcast.simulator.tests.vector.model.TestDataset; @@ -27,8 +28,6 @@ import java.util.Queue; import java.util.concurrent.CompletionStage; import java.util.concurrent.ConcurrentLinkedQueue; -import java.util.concurrent.CountDownLatch; -import java.util.concurrent.locks.ReentrantLock; import java.util.function.Function; import static java.util.concurrent.TimeUnit.MILLISECONDS; @@ -47,17 +46,17 @@ public class VectorCollectionSearchDatasetTest extends HazelcastTest { public int loadFirst = Integer.MAX_VALUE; // graph parameters - public String metric = "COSINE"; + public String metric; - public int maxDegree = 40; + public int maxDegree; - public int efConstruction = 50; + public int efConstruction; public boolean normalize = false; // search parameters - public int limit = 1; + public int limit; // inner test parameters @@ -67,6 +66,8 @@ public class VectorCollectionSearchDatasetTest extends HazelcastTest { private VectorCollection collection; + private DatasetReader reader; + private TestDataset testDataset; private final Queue searchResults = new ConcurrentLinkedQueue<>(); @@ -75,24 +76,12 @@ public class VectorCollectionSearchDatasetTest extends HazelcastTest { private long indexBuildTime = 0; - private final ReentrantLock lock = new ReentrantLock(); - - private final CountDownLatch setupDone = new CountDownLatch(1); @Setup public void setup() { - if (!lock.tryLock()) { - try { - setupDone.await(); - } catch (InterruptedException e) { - throw new RuntimeException(e); - } - return; - } - // only one thread perform the setup scoreMetrics.setName(name); - DatasetReader reader = DatasetReader.create(datasetUrl, workingDirectory, normalize); - var size = Math.min(reader.getSize(), loadFirst); + reader = DatasetReader.create(datasetUrl, workingDirectory, normalize); + int dimension = reader.getDimension(); assert dimension == reader.getTestDatasetDimension() : "dataset dimension does not correspond to query vector dimension"; testDataset = reader.getTestDataset(); @@ -111,6 +100,11 @@ public void setup() { .setEfConstruction(efConstruction) ) ); + } + + @Prepare(global = true) + public void prepare() { + var size = Math.min(reader.getSize(), loadFirst); var indexBuildTimeStart = System.currentTimeMillis(); @@ -154,8 +148,6 @@ public void setup() { logger.info("Collection dimension: {}", reader.getDimension()); logger.info("Cleanup time (min): {}", MILLISECONDS.toMinutes(cleanupTimer)); logger.info("Index build time (min): {}", MILLISECONDS.toMinutes(indexBuildTime)); - - setupDone.countDown(); } @TimeStep() @@ -166,8 +158,11 @@ public void search(ThreadState state) { return; } var vector = testDataset.getSearchVector(iteration); - SearchOptions options = new SearchOptionsBuilder().vector(vector).includePayload().includeVectors().limit(limit).build(); - var result = collection.searchAsync(options).toCompletableFuture().join(); + SearchOptions options = new SearchOptionsBuilder().includeValue().includeVectors().limit(limit).build(); + var result = collection.searchAsync( + VectorValues.of(vector), + options + ).toCompletableFuture().join(); searchResults.add(new TestSearchResult(iteration, vector, result)); } @@ -181,7 +176,7 @@ public void afterRun() { }); writeAllSearchResultsToFile("precision_" + name + ".out"); - appendStatisticsToFile("statistics.out"); + appendStatisticsToFile(); logger.info("Results for {}", name); logger.info("Min score: {}", scoreMetrics.getMin()); logger.info("Max score: {}", scoreMetrics.getMax()); @@ -203,7 +198,7 @@ public int getAndIncrementIteration() { 
} } - public record TestSearchResult(int index, float[] searchVector, SearchResults results) { + public record TestSearchResult(int index, float[] searchVector, SearchResults results) { } private void writeAllSearchResultsToFile(String fileName) { @@ -232,9 +227,9 @@ private void writeAllSearchResultsToFile(String fileName) { } } - private void appendStatisticsToFile(String fileName) { + private void appendStatisticsToFile() { try { - FileWriter fileWriter = new FileWriter(fileName, true); + FileWriter fileWriter = new FileWriter("statistics.out", true); PrintWriter printWriter = new PrintWriter(fileWriter); List values = List.of( name, diff --git a/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/VectorUtils.java b/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/VectorUtils.java index 2f6bedafec..ea46e3e0da 100644 --- a/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/VectorUtils.java +++ b/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/VectorUtils.java @@ -9,7 +9,7 @@ public class VectorUtils { - public static void forEach(SearchResults searchResults, Consumer consumer) { + public static void forEach(SearchResults searchResults, Consumer> consumer) { var resultsIterator = searchResults.results(); while (resultsIterator.hasNext()) { consumer.accept(resultsIterator.next()); From 403c538878b23878d4c31be3be48ed8393c4b8fa Mon Sep 17 00:00:00 2001 From: oshultseva Date: Mon, 17 Jun 2024 11:06:53 +0100 Subject: [PATCH 6/8] vector db test --- .../vector-test/v_dbpedia_openai_1M_all.yaml | 188 ++++++++++++++++++ 1 file changed, 188 insertions(+) create mode 100644 vector-search-simulator/vector-test/v_dbpedia_openai_1M_all.yaml diff --git a/vector-search-simulator/vector-test/v_dbpedia_openai_1M_all.yaml b/vector-search-simulator/vector-test/v_dbpedia_openai_1M_all.yaml new file mode 100644 index 0000000000..6fead71013 --- /dev/null +++ b/vector-search-simulator/vector-test/v_dbpedia_openai_1M_all.yaml @@ -0,0 +1,188 @@ +- name: dbpedia_openai_1M + duration: 60m + repetitions: 1 + clients: 1 + members: 1 + driver: hazelcast-enterprise5 + version: maven=5.5.0-SNAPSHOT + client_args: > + -Xms30g + -Xmx30g + --add-modules jdk.incubator.vector + --enable-preview + --enable-native-access=ALL-UNNAMED + --add-modules java.se + --add-exports java.base/jdk.internal.ref=ALL-UNNAMED + --add-opens java.base/java.lang=ALL-UNNAMED + --add-opens java.base/sun.nio.ch=ALL-UNNAMED + --add-opens java.management/sun.management=ALL-UNNAMED + --add-opens jdk.management/com.sun.management.internal=ALL-UNNAMED + member_args: > + -Xms60g + -Xmx60g + --add-modules jdk.incubator.vector + --enable-preview + --enable-native-access=ALL-UNNAMED + --add-modules java.se + --add-exports java.base/jdk.internal.ref=ALL-UNNAMED + --add-opens java.base/java.lang=ALL-UNNAMED + --add-opens java.base/sun.nio.ch=ALL-UNNAMED + --add-opens java.management/sun.management=ALL-UNNAMED + --add-opens jdk.management/com.sun.management.internal=ALL-UNNAMED + loadgenerator_hosts: loadgenerators + node_hosts: nodes + verify_enabled: False + performance_monitor_interval_seconds: 1 + warmup_seconds: 0 + cooldown_seconds: 0 + license_key: [LICENCE] + parallel: False + test: + - class: com.hazelcast.simulator.tests.vector.VectorCollectionSearchDatasetTest + name: vector_m_16_ef_128 + threadCount: 1 + # ratePerSecond: 5_000 + # interval: 100us + logRateMs: 60_000 + searchProb: 1 + # data parameters + 
datasetUrl: https://storage.googleapis.com/ann-filtered-benchmark/datasets/dbpedia_openai_1M.tgz + workingDirectory: /mnt/nvme1n1/workingDirectory/ + # loadFirst: 10_ + # index params + metric: DOT + maxDegree: 16 + efConstruction: 128 + normalize: false + # search params + limit: 10 + - class: com.hazelcast.simulator.tests.vector.VectorCollectionSearchDatasetTest + name: vector_m_32_ef_128 + threadCount: 1 + # ratePerSecond: 5_000 + # interval: 100us + logRateMs: 60_000 + searchProb: 1 + # data parameters + datasetUrl: https://storage.googleapis.com/ann-filtered-benchmark/datasets/dbpedia_openai_1M.tgz + workingDirectory: /mnt/nvme1n1/workingDirectory/ + # loadFirst: 10_ + # index params + metric: DOT + maxDegree: 32 + efConstruction: 128 + normalize: false + # search params + limit: 10 + - class: com.hazelcast.simulator.tests.vector.VectorCollectionSearchDatasetTest + name: vector_m_64_ef_128 + threadCount: 1 + logRateMs: 60_000 + searchProb: 1 + # data parameters + datasetUrl: https://storage.googleapis.com/ann-filtered-benchmark/datasets/dbpedia_openai_1M.tgz + workingDirectory: /mnt/nvme1n1/workingDirectory/ + # loadFirst: 10_ + # index params + metric: DOT + maxDegree: 64 + efConstruction: 128 + normalize: false + # search params + limit: 10 + - class: com.hazelcast.simulator.tests.vector.VectorCollectionSearchDatasetTest + name: vector_m_16_ef_256 + threadCount: 1 + logRateMs: 60_000 + searchProb: 1 + # data parameters + datasetUrl: https://storage.googleapis.com/ann-filtered-benchmark/datasets/dbpedia_openai_1M.tgz + workingDirectory: /mnt/nvme1n1/workingDirectory/ + # loadFirst: 10_ + # index params + metric: DOT + maxDegree: 16 + efConstruction: 256 + normalize: false + # search params + limit: 10 + - class: com.hazelcast.simulator.tests.vector.VectorCollectionSearchDatasetTest + name: vector_m_32_ef_256 + threadCount: 1 + logRateMs: 60_000 + searchProb: 1 + # data parameters + datasetUrl: https://storage.googleapis.com/ann-filtered-benchmark/datasets/dbpedia_openai_1M.tgz + workingDirectory: /mnt/nvme1n1/workingDirectory/ + # loadFirst: 10_ + # index params + metric: DOT + maxDegree: 32 + efConstruction: 256 + normalize: false + # search params + limit: 10 + - class: com.hazelcast.simulator.tests.vector.VectorCollectionSearchDatasetTest + name: vector_m_64_ef_256 + threadCount: 1 + logRateMs: 60_000 + searchProb: 1 + # data parameters + datasetUrl: https://storage.googleapis.com/ann-filtered-benchmark/datasets/dbpedia_openai_1M.tgz + workingDirectory: /mnt/nvme1n1/workingDirectory/ + # loadFirst: 10_ + # index params + metric: DOT + maxDegree: 64 + efConstruction: 256 + normalize: false + # search params + limit: 10 + - class: com.hazelcast.simulator.tests.vector.VectorCollectionSearchDatasetTest + name: vector_m_16_ef_512 + threadCount: 1 + logRateMs: 60_000 + searchProb: 1 + # data parameters + datasetUrl: https://storage.googleapis.com/ann-filtered-benchmark/datasets/dbpedia_openai_1M.tgz + workingDirectory: /mnt/nvme1n1/workingDirectory/ + # loadFirst: 10_ + # index params + metric: DOT + maxDegree: 16 + efConstruction: 512 + normalize: false + # search params + limit: 10 + - class: com.hazelcast.simulator.tests.vector.VectorCollectionSearchDatasetTest + name: vector_m_32_ef_512 + threadCount: 1 + logRateMs: 60_000 + searchProb: 1 + # data parameters + datasetUrl: https://storage.googleapis.com/ann-filtered-benchmark/datasets/dbpedia_openai_1M.tgz + workingDirectory: /mnt/nvme1n1/workingDirectory/ + # loadFirst: 10_ + # index params + metric: DOT + maxDegree: 32 + 
efConstruction: 512 + normalize: false + # search params + limit: 10 + - class: com.hazelcast.simulator.tests.vector.VectorCollectionSearchDatasetTest + name: vector_m_64_ef_512 + threadCount: 1 + logRateMs: 60_000 + searchProb: 1 + # data parameters + datasetUrl: https://storage.googleapis.com/ann-filtered-benchmark/datasets/dbpedia_openai_1M.tgz + workingDirectory: /mnt/nvme1n1/workingDirectory/ + # loadFirst: 10_ + # index params + metric: DOT + maxDegree: 64 + efConstruction: 512 + normalize: false + # search params + limit: 10 \ No newline at end of file From 0b0f294e3a87a50281799b3766d5bced9faa041a Mon Sep 17 00:00:00 2001 From: Eugene Abramchuk Date: Thu, 11 Jul 2024 17:45:21 +0200 Subject: [PATCH 7/8] update repositories and fix rsync for OSX clients --- bin/hidden/agent | 2 +- java/drivers/driver-hazelcast4plus/conf/install.py | 2 +- java/drivers/driver-hazelcast4plus/pom.xml | 2 +- java/pom.xml | 13 +++++++++---- 4 files changed, 12 insertions(+), 7 deletions(-) diff --git a/bin/hidden/agent b/bin/hidden/agent index 1962fc6f61..71d51f820b 100755 --- a/bin/hidden/agent +++ b/bin/hidden/agent @@ -5,7 +5,7 @@ if [ -f "agent.pid" ] ; then fi if [ -z "${SIMULATOR_HOME}" ] ; then - export SIMULATOR_HOME=$(cd $(dirname $(readlink -f $0 2> /dev/null || readlink $0 2> /dev/null || echo $0))/../.. && pwd) + export SIMULATOR_HOME=$(cd $(dirname $(readlink -f $0 2> /dev/null || readlink -f $0 2> /dev/null || echo $0))/../.. && pwd) fi export JAVA_OPTS="-server -Xmx2g -Xms512m -XX:+HeapDumpOnOutOfMemoryError ${JAVA_EXTRA_OPTS}" diff --git a/java/drivers/driver-hazelcast4plus/conf/install.py b/java/drivers/driver-hazelcast4plus/conf/install.py index bf16fb929f..fc55b8c0e1 100644 --- a/java/drivers/driver-hazelcast4plus/conf/install.py +++ b/java/drivers/driver-hazelcast4plus/conf/install.py @@ -69,7 +69,7 @@ def _get_remote_repo(is_enterprise:bool, version:str): return "https://oss.sonatype.org/content/repositories/releases" else: if version.endswith("-SNAPSHOT"): - return "https://repository.hazelcast.com/snapshot" + return "https://repository.hazelcast.com/snapshot-internal" else: return "https://repository.hazelcast.com/release" diff --git a/java/drivers/driver-hazelcast4plus/pom.xml b/java/drivers/driver-hazelcast4plus/pom.xml index 8f8fc5de9d..b6b1a19ccf 100644 --- a/java/drivers/driver-hazelcast4plus/pom.xml +++ b/java/drivers/driver-hazelcast4plus/pom.xml @@ -73,7 +73,7 @@ com.hazelcast - hazelcast-enterprise-vector + hazelcast-enterprise ${hazelcast.version} diff --git a/java/pom.xml b/java/pom.xml index 8884a8feea..b6dad381af 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -21,11 +21,16 @@ Backup Repository https://repo2.maven.org/maven2/ - - snapshot-repository - Hazelcast Snapshot Repository - https://oss.sonatype.org/content/repositories/snapshots + snapshot-internal + Hazelcast Internal Snapshots + https://repository.hazelcast.com/snapshot-internal/ + + false + + + true + From d0e858c004d37aa51f4cf950dfa1fbaf8297eb13 Mon Sep 17 00:00:00 2001 From: Eugene Abramchuk Date: Tue, 16 Jul 2024 17:09:32 +0200 Subject: [PATCH 8/8] fix failing private snapshot repository lookup and update README.md --- README.md | 5 +++++ java/drivers/driver-hazelcast4plus/conf/install.py | 4 +++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index fb4655799b..6892928d87 100644 --- a/README.md +++ b/README.md @@ -195,6 +195,11 @@ To install Java on the remote machines call: inventory install java ``` +Note that starting from 5.5.0-SNAPSHOT onwards, the private snapshots 
repository is used, so access to it
+needs to be configured both locally (where the Simulator is built and the perftest is kicked off) and remotely (on the
+load generators, etc.).
+See [here](https://hazelcast.atlassian.net/wiki/spaces/DI/pages/5116493883/How+to+access+private+snapshot+maven+artifacts+and+docker+images) for details.
+
 You can pass a custom URL to configure the correct JVM. To get a listing of examples URL's call:
 ```shell
diff --git a/java/drivers/driver-hazelcast4plus/conf/install.py b/java/drivers/driver-hazelcast4plus/conf/install.py
index fc55b8c0e1..c6ab6045ad 100644
--- a/java/drivers/driver-hazelcast4plus/conf/install.py
+++ b/java/drivers/driver-hazelcast4plus/conf/install.py
@@ -69,7 +69,9 @@ def _get_remote_repo(is_enterprise:bool, version:str):
         return "https://oss.sonatype.org/content/repositories/releases"
     else:
         if version.endswith("-SNAPSHOT"):
-            return "https://repository.hazelcast.com/snapshot-internal"
+            # Maven ignores settings.xml authentication unless it is forced with the fully qualified remoteRepositories construct
+            # https://maven.apache.org/plugins/maven-dependency-plugin/get-mojo.html
+            return "snapshot-internal::::https://repository.hazelcast.com/snapshot-internal"
         else:
             return "https://repository.hazelcast.com/release"
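
Editor's note: a quick way to verify private-snapshot access from a given machine is sketched below, using the same
`id::layout::url` repository string that `_get_remote_repo()` now returns (the format documented in the get-mojo page
linked in the comment above). The artifact coordinates are only an example; authentication is expected to come from a
`<server>` entry in `~/.m2/settings.xml` whose `<id>` is `snapshot-internal`, matching the id in the repository string.

```shell
# Sketch: confirm the private snapshot repository is reachable and authenticated.
# The artifact coordinates are illustrative; credentials are resolved from the
# settings.xml <server> entry whose <id> matches "snapshot-internal".
mvn dependency:get \
    -Dartifact=com.hazelcast:hazelcast-enterprise:5.5.0-SNAPSHOT \
    -DremoteRepositories=snapshot-internal::::https://repository.hazelcast.com/snapshot-internal
```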
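
Editor's note on the two `VectorUtils` helpers introduced by these patches. `normalize` rescales a vector to unit
length, so the dot product of two normalized vectors equals their cosine similarity, which is why the DOT-metric test
configurations can offer a `normalize` switch. `restoreRealMetric` maps the engine's score back to the raw metric for
`COSINE` and `DOT` via `2 * jMetric - 1`; the implied forward mapping `(1 + cos)/2` is inferred here by inverting that
expression, not stated in the patches:

$$
\hat{v} = \frac{v}{\lVert v \rVert_2}, \qquad
\hat{u} \cdot \hat{v} = \cos(u, v), \qquad
s = \frac{1 + \cos(u, v)}{2} \;\Longrightarrow\; \cos(u, v) = 2s - 1 .
$$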