diff --git a/README.md b/README.md index 3181ae88f8..6b477e3045 100644 --- a/README.md +++ b/README.md @@ -195,6 +195,11 @@ To install Java on the remote machines call: inventory install java ```
+Note that from 5.5.0-SNAPSHOT onwards the private snapshot repository is used, so access to this repository
+needs to be configured both locally (where the Simulator is built and where perftest is started) and remotely (on the
+load generators and so on).
+See [here](https://hazelcast.atlassian.net/wiki/spaces/DI/pages/5116493883/How+to+access+private+snapshot+maven+artifacts+and+docker+images) for details.
+
You can pass a custom URL to configure the correct JVM. To get a listing of example URLs call: ```shell diff --git a/bin/hidden/agent b/bin/hidden/agent index 1962fc6f61..71d51f820b 100755 --- a/bin/hidden/agent +++ b/bin/hidden/agent @@ -5,7 +5,7 @@ if [ -f "agent.pid" ] ; then fi if [ -z "${SIMULATOR_HOME}" ] ; then
- export SIMULATOR_HOME=$(cd $(dirname $(readlink -f $0 2> /dev/null || readlink $0 2> /dev/null || echo $0))/../.. && pwd)
+ export SIMULATOR_HOME=$(cd $(dirname $(readlink -f $0 2> /dev/null || readlink -f $0 2> /dev/null || echo $0))/../.. && pwd)
fi export JAVA_OPTS="-server -Xmx2g -Xms512m -XX:+HeapDumpOnOutOfMemoryError ${JAVA_EXTRA_OPTS}" diff --git a/java/drivers/driver-hazelcast4plus/conf/install.py b/java/drivers/driver-hazelcast4plus/conf/install.py index 329233a352..37b4ee8b11 100644 --- a/java/drivers/driver-hazelcast4plus/conf/install.py +++ b/java/drivers/driver-hazelcast4plus/conf/install.py @@ -69,7 +69,9 @@ def _get_remote_repo(is_enterprise:bool, version:str): return "https://oss.sonatype.org/content/repositories/releases" else: if version.endswith("-SNAPSHOT"):
- return "https://repository.hazelcast.com/snapshot"
+ # Maven ignores settings.xml authentication unless forced with the fully qualified remoteRepositories syntax:
+ # https://maven.apache.org/plugins/maven-dependency-plugin/get-mojo.html
+ return "snapshot-internal::::https://repository.hazelcast.com/snapshot-internal"
else: return "https://repository.hazelcast.com/release" diff --git a/java/drivers/driver-hazelcast4plus/pom.xml b/java/drivers/driver-hazelcast4plus/pom.xml index a6ab8e0055..3d6da38258 100644 --- a/java/drivers/driver-hazelcast4plus/pom.xml +++ b/java/drivers/driver-hazelcast4plus/pom.xml @@ -13,7 +13,7 @@
-        <hazelcast.version>5.4.0</hazelcast.version>
+        <hazelcast.version>5.5.0</hazelcast.version>
${project.parent.basedir} 4.1.94.Final 2.0.34.Final @@ -71,6 +71,12 @@ ${hazelcast.version}
+
+        <dependency>
+            <groupId>com.hazelcast</groupId>
+            <artifactId>hazelcast-enterprise</artifactId>
+            <version>${hazelcast.version}</version>
+        </dependency>
javax.cache cache-api @@ -136,5 +142,41 @@ ${netty.version}
+
+        <dependency>
+            <groupId>org.apache.commons</groupId>
+            <artifactId>commons-lang3</artifactId>
+            <version>3.14.0</version>
+        </dependency>
+
+        <dependency>
+            <groupId>org.apache.httpcomponents</groupId>
+            <artifactId>httpclient</artifactId>
+            <version>${httpclient.version}</version>
+        </dependency>
+
+        <dependency>
+            <groupId>ai.djl</groupId>
+            <artifactId>api</artifactId>
+            <version>0.27.0</version>
+        </dependency>
+
+        <dependency>
+            <groupId>org.jetbrains.bio</groupId>
+            <artifactId>npy</artifactId>
+            <version>0.3.5</version>
+        </dependency>
+
+        <dependency>
+            <groupId>io.jhdf</groupId>
+            <artifactId>jhdf</artifactId>
+            <version>0.6.10</version>
+        </dependency>
+
+        <dependency>
+            <groupId>org.jctools</groupId>
+            <artifactId>jctools-core</artifactId>
+            <version>4.0.3</version>
+        </dependency>
diff --git a/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/DatasetReader.java b/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/DatasetReader.java new file mode 100644 index 0000000000..9beff2c095 --- /dev/null +++ b/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/DatasetReader.java @@ -0,0 +1,141 @@ +package com.hazelcast.simulator.tests.vector; + +import com.hazelcast.simulator.tests.vector.model.TestDataset; +import com.hazelcast.simulator.tests.vector.readers.HDF5DatasetReader; +import com.hazelcast.simulator.tests.vector.readers.NpyArchiveDatasetReader;
+import org.apache.commons.io.FileUtils; +import org.apache.commons.io.FilenameUtils; +import org.apache.commons.io.IOUtils; +import org.apache.http.HttpResponse; +import org.apache.http.client.ResponseHandler; +import org.apache.http.client.methods.HttpGet; +import org.apache.http.impl.client.CloseableHttpClient; +import org.apache.http.impl.client.HttpClients; +import org.apache.http.impl.client.LaxRedirectStrategy; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; + +import java.io.File; +import java.io.IOException; +import java.io.InputStream; +import java.net.MalformedURLException; +import java.net.URI; +import java.net.URL; +import java.nio.file.Path; + +public abstract class DatasetReader { + + private final URL datasetURL; + + protected final Path workingDirectory; + + protected final File downloadedFile; + + protected float[][] trainDataset; + + protected TestDataset testDataset; + + protected int dimension; + + protected int size; + + protected Boolean normalizeVector = false; + + protected final Logger logger = LogManager.getLogger(getClass()); + + public DatasetReader(String url, String directory) { + this(url, directory, false); + } + public DatasetReader(String url, String directory, Boolean normalizeVector) { + try { + this.datasetURL = URI.create(url).toURL(); + this.workingDirectory = Path.of(directory, FilenameUtils.getBaseName(datasetURL.getFile())); + this.downloadedFile = Path.of(workingDirectory.toString(), FilenameUtils.getName(datasetURL.getFile())).toFile(); + this.normalizeVector = normalizeVector; + + logger.info("Start downloading file from {}", datasetURL); + if (!downloadedFile.exists()) { + download(); + } + logger.info("File downloaded to {}. Start unpacking...", downloadedFile); + + preprocessDatasetFile(); + parseTrainDataset(); + parseTestDataset(); + logger.info("Dataset reader is initialized"); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + protected abstract void preprocessDatasetFile(); + protected abstract void parseTrainDataset(); + protected abstract void parseTestDataset(); + + private void cleanup() { + try { + FileUtils.cleanDirectory(workingDirectory.toFile()); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + public float[] getTrainVector(int index) { + return trainDataset[index]; + } + + public TestDataset getTestDataset() { + return testDataset; + } + + public int getDimension() { + return dimension; + } + + public int getSize() { + return size; + } + + public int getTestDatasetDimension() { + return testDataset.getDimension(); + } + + private void download() { + CloseableHttpClient httpClient = HttpClients.custom() + .setRedirectStrategy(new LaxRedirectStrategy()) + .build(); + try { + HttpGet get = new HttpGet(datasetURL.toURI()); + httpClient.execute(get, new FileDownloadResponseHandler(downloadedFile)); + } catch (Exception e) { + throw new IllegalStateException(e); + } finally { + IOUtils.closeQuietly(httpClient); + } + } + + + private record FileDownloadResponseHandler(File target) implements ResponseHandler { + + @Override + public Void handleResponse(HttpResponse response) throws IOException { + InputStream source = response.getEntity().getContent(); + FileUtils.copyInputStreamToFile(source, this.target); + return null; + } + } + + public static DatasetReader create(String url, String directory, boolean normalizeVector) { + try { + URL datasetUrl = URI.create(url).toURL(); + var ext = FilenameUtils.getExtension(datasetUrl.getFile()); + return switch 
(ext) { + case "hdf5" -> new HDF5DatasetReader(url, directory, normalizeVector); + case "tgz" -> new NpyArchiveDatasetReader(url, directory, normalizeVector); + default -> throw new UnsupportedOperationException("File " + ext + " is not supported"); + }; + } catch (MalformedURLException e) { + throw new RuntimeException(e); + } + } +} diff --git a/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/NpyDatasetReader.java b/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/NpyDatasetReader.java new file mode 100644 index 0000000000..9c908abedc --- /dev/null +++ b/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/NpyDatasetReader.java @@ -0,0 +1,149 @@ +package com.hazelcast.simulator.tests.vector; + +import com.google.gson.JsonArray; +import com.google.gson.JsonParser; +import com.hazelcast.simulator.tests.vector.readers.TarExtractor; +import org.apache.commons.io.FileUtils; +import org.apache.commons.io.FilenameUtils; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.jetbrains.bio.npy.NpyArray; +import org.jetbrains.bio.npy.NpyFile; + +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.net.URI; +import java.net.URL; +import java.nio.charset.Charset; +import java.nio.file.Path; +import java.util.Arrays; +import java.util.List; + +@Deprecated +public class NpyDatasetReader { + + private final URL datasetURL; + + private final Path workingDirectory; + + private final File downloadedFile; + + private final Path vectorsNameFile; + private final Path testNameFile; + + private float[] vectorsPlain; + + private List query; + + private int dimension; + private int size; + + protected final Logger logger = LogManager.getLogger(getClass()); + + public NpyDatasetReader(String url, String directory) { + try { + this.datasetURL = URI.create(url).toURL(); + this.workingDirectory = Path.of(directory, FilenameUtils.getBaseName(datasetURL.getFile())); + this.downloadedFile = Path.of(workingDirectory.toString(), FilenameUtils.getName(datasetURL.getFile())).toFile(); + this.vectorsNameFile = Path.of(workingDirectory.toString(), "vectors.npy"); + this.testNameFile = Path.of(workingDirectory.toString(), "tests.jsonl"); + + logger.info("Start downloading file from " + datasetURL); + if (!downloadedFile.exists()) { + download(); + } + logger.info("File downloaded to " + downloadedFile + ". Start unpacking..."); + + if (!vectorsNameFile.toFile().exists()) { + unpack(); + } + logger.info("Unpacking finished. 
Start parse vectors..."); + + parseArray(); + parseTestCases(); + logger.info("Dataset reader is initialized"); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + private void download() { + try { + FileUtils.copyURLToFile( + datasetURL, + downloadedFile, + 120_000, + 60_000 * 60); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + private void unpack() throws IOException { + TarExtractor.extractTarGZ(new FileInputStream(downloadedFile), workingDirectory); + } + + private void parseArray() { + NpyArray read = NpyFile.read(vectorsNameFile, Integer.MAX_VALUE); + var shape = read.getShape(); + size = shape[0]; + dimension = shape[1]; + vectorsPlain = read.asFloatArray(); + } + + private void parseTestCases() { + try { + var parser = new JsonParser(); + List queryList = FileUtils.readLines(testNameFile.toFile(), Charset.defaultCharset()); + query = queryList.stream() + .map(query -> parser.parse(query).getAsJsonObject().getAsJsonArray("query")) + .map(this::convert) + .toList(); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + private float[] convert(JsonArray array) { + var result = new float[array.size()]; + for (int i = 0; i < array.size(); i++) { + result[i] = array.get(i).getAsFloat(); + } + return result; + } + + private void cleanup() { + try { + FileUtils.cleanDirectory(workingDirectory.toFile()); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + public float[] read(int index) { + if (index >= size) { + throw new RuntimeException("invalid index"); + } + return Arrays.copyOfRange(vectorsPlain, index * dimension, (index + 1) * dimension); + } + + public List getTestCases() { + return query; + } + + public int getDimension() { + return dimension; + } + + public int getSize() { + return size; + } + + public int getQueryDimension() { + if(query.isEmpty()) { + return 0; + } + return query.get(0).length; + } +} diff --git a/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/ScoreMetrics.java b/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/ScoreMetrics.java new file mode 100644 index 0000000000..cca32c94b4 --- /dev/null +++ b/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/ScoreMetrics.java @@ -0,0 +1,42 @@ +package com.hazelcast.simulator.tests.vector; + +import org.HdrHistogram.Histogram; + +public class ScoreMetrics { + + private String name; + + private final Histogram scoreHistogram = new Histogram(100, 3); + + public ScoreMetrics() { + } + + public void set(int score) { + scoreHistogram.recordValue(score); + } + + public long getMin() { + return scoreHistogram.getMinValue(); + } + + public long getMax() { + return scoreHistogram.getMaxValue(); + } + + public double getMean() { + return scoreHistogram.getMean(); + } + + public double getPercentile(double value) { + return scoreHistogram.getValueAtPercentile(value); + } + + public long getPercentLowerThen(int value) { + var lower = scoreHistogram.getCountBetweenValues(0, value); + return (lower * 100) / scoreHistogram.getTotalCount(); + } + + public void setName(String name) { + this.name = name; + } +} diff --git a/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/VectorCollectionPutDatasetTest.java b/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/VectorCollectionPutDatasetTest.java new file mode 100644 index 0000000000..c6ab9c8f08 --- /dev/null +++ 
b/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/VectorCollectionPutDatasetTest.java @@ -0,0 +1,155 @@ +package com.hazelcast.simulator.tests.vector; + +import com.hazelcast.config.vector.Metric; +import com.hazelcast.config.vector.VectorCollectionConfig; +import com.hazelcast.config.vector.VectorIndexConfig; +import com.hazelcast.simulator.hz.HazelcastTest; +import com.hazelcast.simulator.test.BaseThreadState; +import com.hazelcast.simulator.test.annotations.AfterRun; +import com.hazelcast.simulator.test.annotations.Setup; +import com.hazelcast.simulator.test.annotations.TimeStep; +import com.hazelcast.vector.VectorCollection; +import com.hazelcast.vector.VectorDocument; +import com.hazelcast.vector.VectorValues; +import org.HdrHistogram.Histogram; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.concurrent.atomic.AtomicInteger; + +public class VectorCollectionPutDatasetTest extends HazelcastTest { + + public String datasetUrl; + + public String workingDirectory; + + public String name; + + // common parameters + public int loadFirst = Integer.MAX_VALUE; + + public int putBatchSize = 10_000; + + // graph parameters + public String metric; + + public int maxDegree; + + public int efConstruction; + + // inner test parameters + + private static final TimeMetrics metrics = new TimeMetrics(); + private VectorCollection collection; + private final AtomicInteger counter = new AtomicInteger(0); + + private DatasetReader reader; + private List>> buffers = new ArrayList<>(); + + @Setup + public void setup() { + reader = DatasetReader.create(datasetUrl, workingDirectory, false); + int dimension = reader.getDimension(); + loadFirst = Math.min(loadFirst, reader.getSize()); + + collection = VectorCollection.getCollection( + targetInstance, + new VectorCollectionConfig(name) + .addVectorIndexConfig( + new VectorIndexConfig() + .setMetric(Metric.valueOf(metric)) + .setDimension(dimension) + .setMaxDegree(maxDegree) + .setEfConstruction(efConstruction) + ) + ); + logger.info("Use collection with name: {}", collection.getName()); + } + + @TimeStep(prob = 0) + public void put(ThreadState state) { + var iteration = counter.getAndIncrement(); + if (iteration >= loadFirst) { + testContext.stop(); + return; + } + var vector = reader.getTrainVector(iteration); + metrics.recordPut( + () -> collection.putAsync( + iteration, + VectorDocument.of(iteration, VectorValues.of(vector)) + ) + .toCompletableFuture() + .join() + ); + } + + @TimeStep(prob = 1) + public void putAll(ThreadState state) { + var iteration = counter.getAndAdd(putBatchSize); + if (iteration >= loadFirst) { + testContext.stop(); + return; + } + Map> buffer = new HashMap<>(); + metrics.recordBuffer( + () -> { + for (int i = 0; i < putBatchSize; i++) { + var key = iteration + i; + if (key >= reader.size) { + break; + } + var vector = reader.getTrainVector(key); + buffer.put(key, VectorDocument.of(key, VectorValues.of(vector))); + } + } + ); + + metrics.recordPut( + () -> collection.putAllAsync(buffer) + .toCompletableFuture() + .join() + ); + } + + @AfterRun + public void afterRun() { + logger.info("****CUSTOM STATISTICS****"); + logger.info(metrics.getStatistics()); + } + + public static class ThreadState extends BaseThreadState { + } + + + public static class TimeMetrics { + private static final Histogram bufferTimer = new Histogram(2); + private static final Histogram putTimer = new Histogram(2); + + + public void recordBuffer(Runnable 
action) { + var start = System.currentTimeMillis(); + action.run(); + bufferTimer.recordValue(System.currentTimeMillis() - start); + } + + public void recordPut(Runnable action) { + var start = System.currentTimeMillis(); + action.run(); + putTimer.recordValue(System.currentTimeMillis() - start); + } + + public String getStatistics() { + return "\nBuffer 95p: " + bufferTimer.getValueAtPercentile(95) + "\n" + + "Buffer max: " + bufferTimer.getMaxValue() + "\n" + + "Buffer min: " + bufferTimer.getMinValue() + "\n" + + "Buffer mean: " + bufferTimer.getMean() + "\n" + + "Put 95p: " + putTimer.getValueAtPercentile(95) + "\n" + + "Put max: " + putTimer.getMaxValue() + "\n" + + "Put min: " + putTimer.getMinValue() + "\n" + + "Put mean: " + putTimer.getMean() + "\n"; + } + } +} diff --git a/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/VectorCollectionSearchDatasetTest.java b/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/VectorCollectionSearchDatasetTest.java new file mode 100644 index 0000000000..171d393e06 --- /dev/null +++ b/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/VectorCollectionSearchDatasetTest.java @@ -0,0 +1,276 @@ +package com.hazelcast.simulator.tests.vector; + +import com.hazelcast.config.vector.Metric; +import com.hazelcast.config.vector.VectorCollectionConfig; +import com.hazelcast.config.vector.VectorIndexConfig; +import com.hazelcast.core.Pipelining; +import com.hazelcast.simulator.hz.HazelcastTest; +import com.hazelcast.simulator.test.BaseThreadState; +import com.hazelcast.simulator.test.annotations.AfterRun; +import com.hazelcast.simulator.test.annotations.Prepare; +import com.hazelcast.simulator.test.annotations.Setup; +import com.hazelcast.simulator.test.annotations.TimeStep; +import com.hazelcast.simulator.tests.vector.model.TestDataset; +import com.hazelcast.vector.SearchOptions; +import com.hazelcast.vector.SearchOptionsBuilder; +import com.hazelcast.vector.SearchResults; +import com.hazelcast.vector.VectorCollection; +import com.hazelcast.vector.VectorDocument; +import com.hazelcast.vector.VectorValues; + +import java.io.FileWriter; +import java.io.IOException; +import java.io.PrintWriter; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.Queue; +import java.util.concurrent.CompletionStage; +import java.util.concurrent.ConcurrentLinkedQueue; +import java.util.function.Function; + +import static java.util.concurrent.TimeUnit.MILLISECONDS; + +public class VectorCollectionSearchDatasetTest extends HazelcastTest { + + public String name; + + public String datasetUrl; + + public String workingDirectory; + + // common parameters + public int numberOfSearchIterations = Integer.MAX_VALUE; + + public int loadFirst = Integer.MAX_VALUE; + + // graph parameters + public String metric; + + public int maxDegree; + + public int efConstruction; + + public boolean normalize = false; + + // search parameters + + public int limit; + + // inner test parameters + + private static final int PUT_BATCH_SIZE = 2_000; + + private static final int MAX_PUT_ALL_IN_FLIGHT = 5; + + private VectorCollection collection; + + private DatasetReader reader; + + private TestDataset testDataset; + + private final Queue searchResults = new ConcurrentLinkedQueue<>(); + + private final ScoreMetrics scoreMetrics = new ScoreMetrics(); + + private long indexBuildTime = 0; + + + @Setup + public void setup() { + 
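+        // Create the dataset reader first: the collection's index dimension is
+        // derived from the downloaded data rather than configured by hand.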
scoreMetrics.setName(name); + reader = DatasetReader.create(datasetUrl, workingDirectory, normalize); + + int dimension = reader.getDimension(); + assert dimension == reader.getTestDatasetDimension() : "dataset dimension does not correspond to query vector dimension"; + testDataset = reader.getTestDataset(); + numberOfSearchIterations = Math.min(numberOfSearchIterations, testDataset.size()); + + logger.info("Vector collection name: {}", name); + logger.info("Use normalize: {}", normalize); + collection = VectorCollection.getCollection( + targetInstance, + new VectorCollectionConfig(name) + .addVectorIndexConfig( + new VectorIndexConfig() + .setMetric(Metric.valueOf(metric)) + .setDimension(dimension) + .setMaxDegree(maxDegree) + .setEfConstruction(efConstruction) + ) + ); + } + + @Prepare(global = true) + public void prepare() { + var size = Math.min(reader.getSize(), loadFirst); + + var indexBuildTimeStart = System.currentTimeMillis(); + + Map> buffer = new HashMap<>(); + Pipelining pipelining = new Pipelining<>(MAX_PUT_ALL_IN_FLIGHT); + logger.info("Start loading data..."); + + int index = 0; + while (index < size) { + buffer.put(index, VectorDocument.of(index, VectorValues.of(reader.getTrainVector(index)))); + index++; + if (buffer.size() % PUT_BATCH_SIZE == 0) { + addToPipelineWithLogging(pipelining, collection.putAllAsync(buffer)); + logger.info( + "Uploaded {} vectors from {}. Block size: {}. Total time (min): {}", + index, + size, + buffer.size(), + MILLISECONDS.toMinutes(System.currentTimeMillis() - indexBuildTimeStart) + ); + buffer = new HashMap<>(); + } + } + if (!buffer.isEmpty()) { + addToPipelineWithLogging(pipelining, collection.putAllAsync(buffer)); + logger.info("Uploading last vectors block. Last block size: {}.", buffer.size()); + buffer.clear(); + } + + logger.info("Start waiting pipeline results..."); + try { + pipelining.results(); + } catch (Exception e) { + throw new RuntimeException(e); + } + + var cleanupTimer = withTimer(() -> collection.optimizeAsync().toCompletableFuture().join()); + indexBuildTime = System.currentTimeMillis() - indexBuildTimeStart; + + logger.info("Collection size: {}", size); + logger.info("Collection dimension: {}", reader.getDimension()); + logger.info("Cleanup time (min): {}", MILLISECONDS.toMinutes(cleanupTimer)); + logger.info("Index build time (min): {}", MILLISECONDS.toMinutes(indexBuildTime)); + } + + @TimeStep() + public void search(ThreadState state) { + var iteration = state.getAndIncrementIteration(); + if (iteration >= numberOfSearchIterations) { + testContext.stop(); + return; + } + var vector = testDataset.getSearchVector(iteration); + SearchOptions options = new SearchOptionsBuilder().includeValue().includeVectors().limit(limit).build(); + var result = collection.searchAsync( + VectorValues.of(vector), + options + ).toCompletableFuture().join(); + searchResults.add(new TestSearchResult(iteration, vector, result)); + } + + @AfterRun + public void afterRun() { + searchResults.forEach(testSearchResult -> { + int index = testSearchResult.index(); + List ids = new ArrayList<>(); + VectorUtils.forEach(testSearchResult.results, r -> ids.add((Integer) r.getKey())); + scoreMetrics.set((int) (testDataset.getPrecision(ids, index, limit) * 100)); + }); + + writeAllSearchResultsToFile("precision_" + name + ".out"); + appendStatisticsToFile(); + logger.info("Results for {}", name); + logger.info("Min score: {}", scoreMetrics.getMin()); + logger.info("Max score: {}", scoreMetrics.getMax()); + logger.info("Mean score: {}", 
scoreMetrics.getMean()); + logger.info("5pt: {}", scoreMetrics.getPercentile(5)); + logger.info("10pt: {}", scoreMetrics.getPercentile(10)); + logger.info("The percentage of results with precision lower than 98%: {}", scoreMetrics.getPercentLowerThen(98)); + logger.info("The percentage of results with precision lower than 99%: {}", scoreMetrics.getPercentLowerThen(99)); + } + + public static class ThreadState extends BaseThreadState { + + private int iteration = 0; + + public int getAndIncrementIteration() { + var it = iteration; + iteration++; + return it; + } + } + + public record TestSearchResult(int index, float[] searchVector, SearchResults results) { + } + + private void writeAllSearchResultsToFile(String fileName) { + try { + Function restore = VectorUtils.restoreRealMetric(Metric.valueOf(metric)); + var fileWriter = new FileWriter(fileName); + PrintWriter printWriter = new PrintWriter(fileWriter); + printWriter.println("index, searchVector0, foundVector0, foundVectorKey, foundVectorScore, restoredRealVectorScore"); + searchResults.forEach( + testSearchResult -> VectorUtils.forEach( + testSearchResult.results, + (result) -> printWriter.printf( + "%d, %s, %s, %s, %s, %s\n", + testSearchResult.index, + testSearchResult.searchVector[0], + getFirstCoordinate(result.getVectors()), + result.getKey(), + result.getScore(), + restore.apply(result.getScore()) + ) + ) + ); + printWriter.close(); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + private void appendStatisticsToFile() { + try { + FileWriter fileWriter = new FileWriter("statistics.out", true); + PrintWriter printWriter = new PrintWriter(fileWriter); + List values = List.of( + name, + String.valueOf(indexBuildTime), + String.valueOf(scoreMetrics.getMean()) + ); + printWriter.println(String.join(", ", values)); + printWriter.close(); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + void addToPipelineWithLogging(Pipelining pipelining, CompletionStage asyncInvocation) { + var now = System.currentTimeMillis(); + try { + pipelining.add(asyncInvocation); + } catch (InterruptedException e) { + throw new RuntimeException(e); + } + var msBlocked = System.currentTimeMillis() - now; + // log if we were blocked for more than 30 sec + if (msBlocked > 30_000) { + logger.info( + "Thread was blocked for {} sec due to reaching max pipeline depth", + MILLISECONDS.toSeconds(msBlocked) + ); + } + } + + private long withTimer(Runnable runnable) { + var start = System.currentTimeMillis(); + runnable.run(); + return System.currentTimeMillis() - start; + } + + private float getFirstCoordinate(VectorValues vectorValues) { + var v = (VectorValues.SingleVectorValues) vectorValues; + if (v == null || v.vector().length == 0) { + return 0; + } + return v.vector()[0]; + } +} diff --git a/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/VectorUtils.java b/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/VectorUtils.java new file mode 100644 index 0000000000..ea46e3e0da --- /dev/null +++ b/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/VectorUtils.java @@ -0,0 +1,36 @@ +package com.hazelcast.simulator.tests.vector; + +import com.hazelcast.vector.SearchResult; +import com.hazelcast.vector.SearchResults; +import com.hazelcast.config.vector.Metric; + +import java.util.function.Consumer; +import java.util.function.Function; + +public class VectorUtils { + + public static void forEach(SearchResults 
searchResults, Consumer> consumer) { + var resultsIterator = searchResults.results(); + while (resultsIterator.hasNext()) { + consumer.accept(resultsIterator.next()); + } + } + + public static Function restoreRealMetric(Metric metric) { + return switch (metric) { + case COSINE, DOT -> jMetric -> 2 * jMetric - 1; + default -> jMetric -> -1f; + }; + } + + public static void normalize(float[] vector) { + double length = 0; + for (float v : vector) { + length += v * v; + } + var scale = (float) (1 / Math.sqrt(length)); + for (int i = 0; i < vector.length; i++) { + vector[i] *= scale; + } + } +} diff --git a/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/model/TestDataset.java b/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/model/TestDataset.java new file mode 100644 index 0000000000..0edc7342dd --- /dev/null +++ b/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/model/TestDataset.java @@ -0,0 +1,43 @@ +package com.hazelcast.simulator.tests.vector.model; + +import java.util.Arrays; +import java.util.HashSet; +import java.util.List; +import java.util.stream.Collectors; + +public class TestDataset { + + private final float[][] searchVectors; + + private final int[][] closestIds; + + private final float[][] closestScores; + + public TestDataset(float[][] searchVector, int[][] closestIds, float[][] closestScores) { + this.searchVectors = searchVector; + this.closestIds = closestIds; + this.closestScores = closestScores; + } + + public float[] getSearchVector(int index) { + return searchVectors[index]; + } + + public int getDimension() { + if(searchVectors.length == 0) { + return 0; + } + return searchVectors[0].length; + } + + public int size() { + return searchVectors.length; + } + + public float getPrecision(List actualVectorsIds, int index, int top) { + var actualSet = new HashSet<>(actualVectorsIds); + var expectedSet = Arrays.stream(Arrays.copyOfRange(closestIds[index], 0, top)).boxed().collect(Collectors.toSet()); + actualSet.retainAll(expectedSet); + return ((float) actualSet.size()) / top; + } +} diff --git a/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/readers/HDF5DatasetReader.java b/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/readers/HDF5DatasetReader.java new file mode 100644 index 0000000000..f103873e10 --- /dev/null +++ b/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/readers/HDF5DatasetReader.java @@ -0,0 +1,83 @@ +package com.hazelcast.simulator.tests.vector.readers; + +import com.hazelcast.simulator.tests.vector.DatasetReader; +import com.hazelcast.simulator.tests.vector.VectorUtils; +import com.hazelcast.simulator.tests.vector.model.TestDataset; +import io.jhdf.HdfFile; +import io.jhdf.api.Dataset; + +public class HDF5DatasetReader extends DatasetReader { + + private static final int BULK_READER_SIZE = 50_000; + + public HDF5DatasetReader(String url, String directory, Boolean normalizeVector) { + super(url, directory, normalizeVector); + } + + @Override + protected void preprocessDatasetFile() { + + } + + @Override + protected void parseTrainDataset() { + trainDataset = getDatasetAsFloatMatrix("train"); + size = trainDataset.length; + dimension = trainDataset[0].length; + if (normalizeVector) { + for (float[] vector : trainDataset) { + VectorUtils.normalize(vector); + } + } + } + + @Override + protected void parseTestDataset() { + var 
searchVectors = getDatasetAsFloatMatrix("test"); + var ids = getDatasetAsIntMatrix("neighbors"); + var scores = getDatasetAsFloatMatrix("distances"); + testDataset = new TestDataset(searchVectors, ids, scores); + if (normalizeVector) { + for (float[] vector : searchVectors) { + VectorUtils.normalize(vector); + } + } + } + + private float[][] getDatasetAsFloatMatrix(String datasetName) { + try (HdfFile hdfFile = new HdfFile(downloadedFile.toPath())) { + var datasetNode = hdfFile.getChildren().get(datasetName); + Dataset dataset = hdfFile.getDatasetByPath(datasetNode.getPath()); + var dimension = dataset.getDimensions()[1]; + var size = dataset.getDimensions()[0]; + + float[][] matrix = new float[size][dimension]; + + for (int i = 0; i < size; i += BULK_READER_SIZE) { + int length = Math.min(BULK_READER_SIZE, size - i); + float[][] buffer = (float[][]) dataset.getData(new long[]{i, 0}, new int[]{length, dimension}); + System.arraycopy(buffer, 0, matrix, i, buffer.length); + } + return matrix; + } + } + + // todo - refactor one file once + private int[][] getDatasetAsIntMatrix(String datasetName) { + try (HdfFile hdfFile = new HdfFile(downloadedFile.toPath())) { + var datasetNode = hdfFile.getChildren().get(datasetName); + Dataset dataset = hdfFile.getDatasetByPath(datasetNode.getPath()); + var dimension = dataset.getDimensions()[1]; + var size = dataset.getDimensions()[0]; + + int[][] matrix = new int[size][dimension]; + + for (int i = 0; i < size; i += BULK_READER_SIZE) { + int length = Math.min(BULK_READER_SIZE, size - i); + int[][] buffer = (int[][]) dataset.getData(new long[]{i, 0}, new int[]{length, dimension}); + System.arraycopy(buffer, 0, matrix, i, buffer.length); + } + return matrix; + } + } +} diff --git a/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/readers/NpyArchiveDatasetReader.java b/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/readers/NpyArchiveDatasetReader.java new file mode 100644 index 0000000000..a0f01caca8 --- /dev/null +++ b/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/readers/NpyArchiveDatasetReader.java @@ -0,0 +1,115 @@ +package com.hazelcast.simulator.tests.vector.readers; + +import com.google.common.collect.Lists; +import com.google.gson.JsonArray; +import com.google.gson.JsonParser; +import com.hazelcast.simulator.tests.vector.DatasetReader; +import com.hazelcast.simulator.tests.vector.VectorUtils; +import com.hazelcast.simulator.tests.vector.model.TestDataset; +import org.apache.commons.io.FileUtils; +import org.jetbrains.bio.npy.NpyArray; +import org.jetbrains.bio.npy.NpyFile; + +import java.io.FileInputStream; +import java.io.IOException; +import java.nio.charset.Charset; +import java.nio.file.Path; +import java.util.Arrays; +import java.util.List; + +public class NpyArchiveDatasetReader extends DatasetReader { + + private Path trainDatasetFilename; + private Path testDatesetFilename; + + public NpyArchiveDatasetReader(String url, String directory, boolean normalizeVector) { + super(url, directory, normalizeVector); + } + + @Override + protected void preprocessDatasetFile() { + this.trainDatasetFilename = Path.of(workingDirectory.toString(), "vectors.npy"); + this.testDatesetFilename = Path.of(workingDirectory.toString(), "tests.jsonl"); + + if (!trainDatasetFilename.toFile().exists()) { + unpack(); + } + logger.info("Unpacking finished."); + } + + private void unpack() { + try { + TarExtractor.extractTarGZ(new 
FileInputStream(downloadedFile), workingDirectory); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + @Override + protected void parseTrainDataset() { + NpyArray read = NpyFile.read(trainDatasetFilename, Integer.MAX_VALUE); + var shape = read.getShape(); + size = shape[0]; + dimension = shape[1]; + trainDataset = new float[size][]; + var trainDatasetPlain = read.asFloatArray(); + for (int i = 0; i < size; i++) { + var vector = getTrainVectorPlain(i, trainDatasetPlain); + if (normalizeVector) { + VectorUtils.normalize(vector); + } + trainDataset[i] = vector; + } + } + + @Override + protected void parseTestDataset() { + try { + var parser = new JsonParser(); + List queryList = FileUtils.readLines(testDatesetFilename.toFile(), Charset.defaultCharset()); + int size = queryList.size(); + var searchVectors = new float[size][dimension]; + var searchClosestIds = new int[size][]; + var searchClosestScore = new float[size][]; + for (int i = 0; i < queryList.size(); i++) { + var queryObject = parser.parse(queryList.get(i)).getAsJsonObject(); + var jsonArray = queryObject.getAsJsonArray("query"); + var ids = queryObject.getAsJsonArray("closest_ids"); + var scores = queryObject.getAsJsonArray("closest_scores"); + searchVectors[i] = convertToFloatArray(jsonArray); + if (normalizeVector) { + VectorUtils.normalize(searchVectors[i]); + } + searchClosestIds[i] = convertToIntArray(ids); + searchClosestScore[i] = convertToFloatArray(scores); + } + testDataset = new TestDataset(searchVectors, searchClosestIds, searchClosestScore); + } catch (IOException e) { + throw new RuntimeException(e); + } + } + + + private float[] convertToFloatArray(JsonArray array) { + var result = new float[array.size()]; + for (int i = 0; i < array.size(); i++) { + result[i] = array.get(i).getAsFloat(); + } + return result; + } + + private int[] convertToIntArray(JsonArray array) { + var result = new int[array.size()]; + for (int i = 0; i < array.size(); i++) { + result[i] = array.get(i).getAsInt(); + } + return result; + } + + private float[] getTrainVectorPlain(int index, float[] trainDatasetPlain) { + if (index >= size) { + throw new RuntimeException("invalid index"); + } + return Arrays.copyOfRange(trainDatasetPlain, index * dimension, (index + 1) * dimension); + } +} diff --git a/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/readers/TarExtractor.java b/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/readers/TarExtractor.java new file mode 100644 index 0000000000..da016919c7 --- /dev/null +++ b/java/drivers/driver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/readers/TarExtractor.java @@ -0,0 +1,51 @@ +package com.hazelcast.simulator.tests.vector.readers; + +import org.apache.commons.compress.archivers.tar.TarArchiveEntry; +import org.apache.commons.compress.archivers.tar.TarArchiveInputStream; +import org.apache.commons.compress.compressors.gzip.GzipCompressorInputStream; + +import java.io.BufferedOutputStream; +import java.io.File; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStream; +import java.nio.file.Path; + +public class TarExtractor { + + private static final int BUFFER_SIZE = 100_000; + + public static void extractTarGZ(InputStream in, Path directory) throws IOException { + GzipCompressorInputStream gzipIn = new GzipCompressorInputStream(in); + try (TarArchiveInputStream tarIn = new TarArchiveInputStream(gzipIn)) { + TarArchiveEntry entry; + + while ((entry = 
tarIn.getNextEntry()) != null) {
+                // If the entry is a directory, create it under the target directory.
+                if (entry.isDirectory()) {
+                    File f = new File(directory.toFile(), entry.getName());
+                    boolean created = f.mkdirs();
+                    if (!created) {
+                        System.out.printf("Unable to create directory '%s' during extraction of archive contents.\n",
+                                f.getAbsolutePath());
+                    }
+                } else {
+                    int count;
+                    byte[] data = new byte[BUFFER_SIZE];
+                    var output = Path.of(directory.toString(), entry.getName());
+                    FileOutputStream fos = new FileOutputStream(output.toFile(), false);
+                    try (BufferedOutputStream dest = new BufferedOutputStream(fos, BUFFER_SIZE)) {
+                        while ((count = tarIn.read(data, 0, BUFFER_SIZE)) != -1) {
+                            dest.write(data, 0, count);
+                        }
+                    }
+                }
+            }
+
+            System.out.println("Untar completed successfully!");
+        }
+    }
+}
+
+
+
diff --git a/java/drivers/driver-hazelcast4plus/src/test/java/com/hazelcast/simulator/tests/vector/model/TestDatasetDiffblueTest.java b/java/drivers/driver-hazelcast4plus/src/test/java/com/hazelcast/simulator/tests/vector/model/TestDatasetDiffblueTest.java new file mode 100644 index 0000000000..71463cfea8 --- /dev/null +++ b/java/drivers/driver-hazelcast4plus/src/test/java/com/hazelcast/simulator/tests/vector/model/TestDatasetDiffblueTest.java @@ -0,0 +1,47 @@ +package com.hazelcast.simulator.tests.vector.model;
+
+import org.junit.Test;
+
+import java.util.List;
+
+import static org.junit.Assert.assertEquals;
+
+public class TestDatasetDiffblueTest {
+    @Test
+    public void testGetPrecision_score100() {
+        var actual = (
+                new TestDataset(
+                        new float[][]{new float[]{0f}},
+                        new int[][]{new int[]{1, 2, 3, 4}},
+                        new float[][]{new float[]{0f}}
+                )
+        ).getPrecision(List.of(1, 2), 0, 2);
+        assertEquals(1, actual, 0.0f);
+    }
+
+    @Test
+    public void testGetPrecision_score0() {
+        assertEquals(0.0f,
+                (
+                        new TestDataset(
+                                new float[][]{new float[]{0f}},
+                                new int[][]{new int[]{1, 2, 3, 4}, new int[]{1, 2, 1, 2}},
+                                new float[][]{new float[]{0f}}
+                        )
+                ).getPrecision(List.of(2), 0, 1),
+                0.0f);
+    }
+
+    @Test
+    public void testGetPrecision_score50() {
+        assertEquals(0.5f,
+                (
+                        new TestDataset(
+                                new float[][]{new float[]{0f}},
+                                new int[][]{new int[]{1, 2, 3, 4}, new int[]{2, 5, 6}},
+                                new float[][]{new float[]{0f}}
+                        )
+                ).getPrecision(List.of(2, 6), 0, 2),
+                0.1f);
+    }
+}
diff --git a/java/drivers/driver-hazelcast4plus/src/test/java/com/hazelcast/simulator/utils/DatasetReaderTest.java b/java/drivers/driver-hazelcast4plus/src/test/java/com/hazelcast/simulator/utils/DatasetReaderTest.java new file mode 100644 index 0000000000..77f8be0008 --- /dev/null +++ b/java/drivers/driver-hazelcast4plus/src/test/java/com/hazelcast/simulator/utils/DatasetReaderTest.java @@ -0,0 +1,66 @@ +package com.hazelcast.simulator.utils;
+
+import com.hazelcast.simulator.tests.vector.DatasetReader;
+import org.junit.Test;
+
+import java.util.Arrays;
+
+import static org.junit.Assert.assertEquals;
+import static org.junit.Assert.assertNotNull;
+
+public class DatasetReaderTest {
+
+    //String url = "https://storage.googleapis.com/ann-filtered-benchmark/datasets/random_keywords_1m.tgz";
+    //String url = "https://storage.googleapis.com/ann-filtered-benchmark/datasets/hnm.tgz"; // work
+    //String url = "https://storage.googleapis.com/ann-filtered-benchmark/datasets/yandex_t2i_gt_100k.tgz"; // not vectors in archive
+    //String url = "https://storage.googleapis.com/ann-filtered-benchmark/datasets/laion-small-clip.tgz"; // broken?
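+    // NOTE: These are manual integration tests rather than unit tests: on first run they
+    // download the full dataset over the network and cache it under workingDirectory,
+    // which points at a local, machine-specific path.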
+
+    String workingDirectory = "/Users/oshultseva/Downloads/dataset_output/";
+    @Test
+    public void npy_dbpedia() {
+        String url = "https://storage.googleapis.com/ann-filtered-benchmark/datasets/dbpedia_openai_1M.tgz";
+
+        DatasetReader reader = DatasetReader.create(url, workingDirectory, true);
+
+        assertEquals(975_000, reader.getSize());
+        assertEquals(1536, reader.getDimension());
+        assertEquals(1536, reader.getTestDatasetDimension());
+
+        assertNotNull(reader.getTrainVector(1234));
+        assertEquals(5000, reader.getTestDataset().size());
+
+        assertEquals(0.01739898, reader.getTrainVector(0)[0], 0.0001);
+        assertEquals(-0.04525524, reader.getTrainVector(0)[1535], 0.0001);
+    }
+
+    @Test
+    public void hdf5_angular() {
+        String url = "http://ann-benchmarks.com/glove-100-angular.hdf5";
+
+        DatasetReader reader = DatasetReader.create(url, workingDirectory, true);
+
+        assertEquals(1_183_514, reader.getSize());
+        assertEquals(100, reader.getDimension());
+        assertEquals(100, reader.getTestDatasetDimension());
+
+        assertEquals(10_000, reader.getTestDataset().size());
+        assertEquals(-0.02701984, reader.getTrainVector(0)[0], 0.0001);
+        assertEquals(-0.00503204, reader.getTrainVector(0)[99], 0.0001);
+
+        assertEquals(0.08828659, reader.getTestDataset().getSearchVector(0)[0], 0.0001);
+        assertEquals(-0.0329303, reader.getTestDataset().getSearchVector(0)[99], 0.0001);
+    }
+    @Test
+    public void hdf5_960_euclidean() {
+        String url = "http://ann-benchmarks.com/gist-960-euclidean.hdf5";
+
+        DatasetReader reader = DatasetReader.create(url, workingDirectory, false);
+
+        assertEquals(1_000_000, reader.getSize());
+        assertEquals(960, reader.getDimension());
+        assertEquals(960, reader.getTestDatasetDimension());
+
+        assertNotNull(reader.getTrainVector(1234));
+        assertEquals(1000, reader.getTestDataset().size());
+    }
+}
diff --git a/java/pom.xml b/java/pom.xml index 3b318bc3cd..b6dad381af 100644 --- a/java/pom.xml +++ b/java/pom.xml @@ -21,11 +21,16 @@ Backup Repository https://repo2.maven.org/maven2/
-
-            <id>snapshot-repository</id>
-            <name>Hazelcast Snapshot Repository</name>
-            <url>https://oss.sonatype.org/content/repositories/snapshots</url>
+            <id>snapshot-internal</id>
+            <name>Hazelcast Internal Snapshots</name>
+            <url>https://repository.hazelcast.com/snapshot-internal/</url>
+            <releases>
+                <enabled>false</enabled>
+            </releases>
+            <snapshots>
+                <enabled>true</enabled>
+            </snapshots>
@@ -73,13 +78,13 @@ 3.1.2 3.3.0
- 3.10
+ 3.3.0.603
0.8.10 4.9.10 2.16.1
- 2.16.1
+ 3.0.1
1.9.4 3.6.0 diff --git a/vector-search-simulator/README.md b/vector-search-simulator/README.md new file mode 100644 index 0000000000..171b62e0e5 --- /dev/null +++ b/vector-search-simulator/README.md @@ -0,0 +1,3 @@
+# Vector Search Simulator
+
+`vector-test` and `vector-test-2` are two different environments that run the same tests.
\ No newline at end of file
diff --git a/vector-search-simulator/vector-test-2/README.md b/vector-search-simulator/vector-test-2/README.md new file mode 100644 index 0000000000..d7758a3c4f --- /dev/null +++ b/vector-search-simulator/vector-test-2/README.md @@ -0,0 +1,36 @@
+The purpose of this simulator setup is to test Hazelcast vector search.
+
+To modify the environment, edit `inventory_plan.yaml`.
+
+To create the environment:
+```shell
+inventory apply
+```
+
+To get an overview of the available instances:
+```shell
+cat inventory.yaml
+```
+
+To install Java and the Simulator on the environment:
+```shell
+inventory install java
+inventory install simulator
+```
+
+To tune the environment for the best performance:
+```shell
+inventory tune
+```
+
+Modify the tests by editing the `tests.yaml` file.
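+
+For example, a minimal entry for the vector search test added in this PR could look roughly
+like the sketch below (the dataset URL, HNSW parameters and sizes are illustrative values,
+not recommendations; the property names match the public fields of
+`VectorCollectionSearchDatasetTest`):
+```yaml
+- name: vector_search
+  duration: 30m
+  clients: 1
+  members: 1
+  driver: hazelcast-enterprise5
+  version: maven=5.5.0
+  test:
+    class: com.hazelcast.simulator.tests.vector.VectorCollectionSearchDatasetTest
+    name: vector_search
+    datasetUrl: http://ann-benchmarks.com/glove-100-angular.hdf5
+    workingDirectory: /tmp/datasets
+    normalize: true
+    metric: COSINE
+    maxDegree: 32
+    efConstruction: 100
+    limit: 10
+```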
+
+To run the tests:
+```shell
+perftest run
+```
+
+To destroy the environment:
+```shell
+inventory destroy
+```
\ No newline at end of file
diff --git a/vector-search-simulator/vector-test-2/ansible.cfg b/vector-search-simulator/vector-test-2/ansible.cfg new file mode 100644 index 0000000000..1d761c5638 --- /dev/null +++ b/vector-search-simulator/vector-test-2/ansible.cfg @@ -0,0 +1,10 @@
+[defaults]
+host_key_checking = False
+inventory = inventory.yaml
+
+
+# For nicely formatted output
+stdout_callback = yaml
+bin_ansible_callbacks = True
+
+interpreter_python=auto_silent
diff --git a/vector-search-simulator/vector-test-2/aws/auto_mount.py b/vector-search-simulator/vector-test-2/aws/auto_mount.py new file mode 100755 index 0000000000..db3e7886c2 --- /dev/null +++ b/vector-search-simulator/vector-test-2/aws/auto_mount.py @@ -0,0 +1,66 @@
+#!/usr/bin/env python3
+
+import subprocess
+import threading
+
+filesystem = "ext3"
+
+
+def list_unmounted_partitions():
+    cmd = 'lsblk --noheadings --raw -o NAME,MOUNTPOINT'
+    lines = subprocess.check_output(cmd, shell=True, text=True).strip().splitlines()
+
+    devices = []
+    partitions = []
+
+    for line in lines:
+        record = line.split()
+        name = record[0]
+
+        if not name.startswith("nvme"):
+            continue
+
+        if "p" in name:
+            partitions.append(name)
+        elif len(record) == 1:
+            devices.append(name)
+
+    # Iterate over a copy: removing from the list being iterated would skip entries.
+    for dev in list(devices):
+        for partition in partitions:
+            if partition.startswith(dev):
+                devices.remove(dev)
+                break
+
+    return devices
+
+
+def format_and_mount(dev):
+    cmd = f'sudo mkfs.{filesystem} /dev/{dev}'
+    print(cmd)
+    subprocess.run(cmd, shell=True, text=True, check=True)
+
+    cmd = f'sudo mkdir -p /mnt/{dev}'
+    print(cmd)
+    subprocess.run(cmd, shell=True, text=True, check=True)
+
+    cmd = f'sudo mount -t {filesystem} /dev/{dev} /mnt/{dev}'
+    print(cmd)
+    subprocess.run(cmd, shell=True, text=True, check=True)
+
+    cmd = f'sudo chown ubuntu /mnt/{dev}'
+    print(cmd)
+    subprocess.run(cmd, shell=True, text=True, check=True)
+
+
+unmounted = list_unmounted_partitions()
+
+print(unmounted)
+
+jobs = []
+for dev in unmounted:
+    # Pass the callable and its argument separately; calling format_and_mount(dev) here
+    # would run it synchronously and hand Thread its None return value as the target.
+    job = threading.Thread(target=format_and_mount, args=(dev,))
+    jobs.append(job)
+    job.start()
+
+for job in jobs:
+    job.join()
diff --git a/vector-search-simulator/vector-test-2/aws/main.tf b/vector-search-simulator/vector-test-2/aws/main.tf new file mode 100644 index 0000000000..bbc6077b04 --- /dev/null +++ b/vector-search-simulator/vector-test-2/aws/main.tf @@ -0,0 +1,378 @@
+
+locals {
+  settings = yamldecode(file("../inventory_plan.yaml"))
+  private_key = file("../${local.settings.keypair.private_key}")
+  public_key = file("../${local.settings.keypair.public_key}")
+}
+
+provider "aws" {
+  profile = "oshultseva"
+  region = local.settings.region
+}
+
+resource "aws_default_vpc" "vpc" {
+  tags = {
+    Name = "Default VPC"
+  }
+}
+
+#resource "aws_vpc" "prod-vpc" {
+#  cidr_block = "10.0.0.0/16"
+#  enable_dns_support = "true" #gives you an internal domain name
+#  enable_dns_hostnames = "true" #gives you an internal host name
+#  enable_classiclink = "false"
+#  instance_tenancy = "default"
+#
+#  tags = {
+#    Name = "prod-vpc"
+#  }
+#}
+
+#resource "aws_internet_gateway" "my_vpc_igw" {
+#  vpc_id = local.settings.vpc_id
+#
+#  tags = {
+#    Name = "My VPC - Internet Gateway"
+#  }
+#}
+
+resource "aws_key_pair" "keypair" {
+  key_name = "simulator-keypair-${local.settings.basename}"
+  public_key = local.public_key
+}
+
+resource "aws_subnet" "subnet" {
+  vpc_id = local.settings.vpc_id
+  cidr_block = local.settings.cidr_block
+  availability_zone =
local.settings.availability_zone + map_public_ip_on_launch = true + tags = { + Name = "Simulator Public Subnet ${local.settings.basename}" + } +} + +resource "aws_route_table" "route_table" { + vpc_id = local.settings.vpc_id + route { + cidr_block = "0.0.0.0/0" + gateway_id = local.settings.internet_gateway_id + } + + tags = { + Name = "Simulator Public Subnet Route Table ${local.settings.basename}" + } +} + +resource "aws_route_table_association" "route_table_association" { + subnet_id = aws_subnet.subnet.id + route_table_id = aws_route_table.route_table.id +} + +# ========== nodes ========================== + +# Currently there is a single placement group defined for all nodes/load generators. +# If you want to use placement_group, uncomment the commented out 'placementgroup' +# configuration in the nodes and loadgenerators sections. +resource "aws_placement_group" "cluster_placement_group" { + name = "simulator-placement-group-${local.settings.basename}" + strategy = "cluster" +} + +resource "aws_security_group" "node-sg" { + name = "simulator-security-group-node-${local.settings.basename}" + description = "Security group for the node" + vpc_id = local.settings.vpc_id + + tags = { + Name = "Simulator Node Security Group ${local.settings.basename}", + Owner = local.settings.owner + } + + ingress { + description = "SSH" + from_port = 22 + to_port = 22 + protocol = "tcp" + cidr_blocks = ["0.0.0.0/0"] + } + + + ingress { + description = "iperf3_udp" + from_port = 3000 + to_port = 3000 + protocol = "udp" + cidr_blocks = ["0.0.0.0/0"] + } + + ingress { + description = "iperf3_tcp" + from_port = 3000 + to_port = 3000 + protocol = "tcp" + cidr_blocks = ["0.0.0.0/0"] + } + + ingress { + description = "Hazelcast" + from_port = 5701 + to_port = 5801 + protocol = "tcp" + cidr_blocks = ["0.0.0.0/0"] + } + + ingress { + description = "Simulator" + from_port = 9000 + to_port = 9001 + protocol = "tcp" + cidr_blocks = ["0.0.0.0/0"] + } + + ingress { + description = "Hazelcast-tpc" + from_port = 11000 + to_port = 12000 + protocol = "tcp" + cidr_blocks = ["0.0.0.0/0"] + } + + egress { + from_port = 0 + to_port = 0 + protocol = "-1" + cidr_blocks = ["0.0.0.0/0"] + } +} + +resource "aws_instance" "nodes" { + key_name = aws_key_pair.keypair.key_name + ami = local.settings.nodes.ami + instance_type = local.settings.nodes.instance_type + count = local.settings.nodes.count + availability_zone = local.settings.availability_zone + #placement_group = aws_placement_group.cluster_placement_group.name + vpc_security_group_ids = [ aws_security_group.node-sg.id ] + subnet_id = aws_subnet.subnet.id + tenancy = local.settings.nodes.tenancy + + tags = { + Name = "Simulator Node ${local.settings.basename}" + Owner = local.settings.owner + "passthrough:ansible_ssh_private_key_file" = local.settings.keypair.private_key + "passthrough:ansible_user" = local.settings.nodes.user + } + + connection { + type = "ssh" + user = local.settings.nodes.user + private_key = local.private_key + host = self.public_ip + } + + provisioner "file" { + source = "auto_mount.py" + destination = "/tmp/auto_mount.py" + } + + provisioner "remote-exec" { + inline = [ + "python3 /tmp/auto_mount.py" + ] + } +} + +output "nodes" { + value = [aws_instance.nodes.*] +} + +# ========== load generators ========================== + +resource "aws_security_group" "loadgenerator-sg" { + name = "simulator-security-group-loadgenerator-${local.settings.basename}" + description = "Security group for the loadgenerator" + vpc_id = local.settings.vpc_id + + tags = { + 
Name = "Simulator Load Balancer Security Group ${local.settings.basename}", + Owner = local.settings.owner + } + + ingress { + description = "SSH" + from_port = 22 + to_port = 22 + protocol = "tcp" + cidr_blocks = ["0.0.0.0/0"] + } + + + ingress { + description = "iperf3_udp" + from_port = 3000 + to_port = 3000 + protocol = "udp" + cidr_blocks = ["0.0.0.0/0"] + } + + ingress { + description = "iperf3_tcp" + from_port = 3000 + to_port = 3000 + protocol = "tcp" + cidr_blocks = ["0.0.0.0/0"] + } + + ingress { + description = "Hazelcast" + from_port = 5701 + to_port = 5801 + protocol = "tcp" + cidr_blocks = ["0.0.0.0/0"] + } + + ingress { + description = "Simulator" + from_port = 9000 + to_port = 9001 + protocol = "tcp" + cidr_blocks = ["0.0.0.0/0"] + } + + egress { + from_port = 0 + to_port = 0 + protocol = "-1" + cidr_blocks = ["0.0.0.0/0"] + } +} + +resource "aws_instance" "loadgenerators" { + key_name = aws_key_pair.keypair.key_name + ami = local.settings.loadgenerators.ami + instance_type = local.settings.loadgenerators.instance_type + count = local.settings.loadgenerators.count + subnet_id = aws_subnet.subnet.id + availability_zone = local.settings.availability_zone + #placement_group = aws_placement_group.cluster_placement_group.name + vpc_security_group_ids = [ aws_security_group.loadgenerator-sg.id ] + tenancy = local.settings.loadgenerators.tenancy + tags = { + Name = "Simulator Load Generator ${local.settings.basename}" + Owner = local.settings.owner + "passthrough:ansible_ssh_private_key_file" = local.settings.keypair.private_key + "passthrough:ansible_user" = local.settings.loadgenerators.user + } + + connection { + type = "ssh" + user = local.settings.loadgenerators.user + private_key = local.private_key + host = self.public_ip + } + + provisioner "file" { + source = "auto_mount.py" + destination = "/tmp/auto_mount.py" + } + + provisioner "remote-exec" { + inline = [ + "python3 /tmp/auto_mount.py" + ] + } +} + +output "loadgenerators" { + value = [aws_instance.loadgenerators.*] +} + +# ========== management center ========================== + +resource "aws_security_group" "mc-sg" { + name = "simulator-security-group-mc-${local.settings.basename}" + description = "Security group for the Management Center" + vpc_id = local.settings.vpc_id + + tags = { + Name = "Simulator MC Security Group ${local.settings.basename}", + Owner = local.settings.owner + } + + ingress { + description = "SSH" + from_port = 22 + to_port = 22 + protocol = "tcp" + cidr_blocks = ["0.0.0.0/0"] + } + + ingress { + description = "Hazelcast" + from_port = 8080 + to_port = 8080 + protocol = "tcp" + cidr_blocks = ["0.0.0.0/0"] + } + + ingress { + description = "Simulator" + from_port = 8443 + to_port = 8443 + protocol = "tcp" + cidr_blocks = ["0.0.0.0/0"] + } + + egress { + from_port = 0 + to_port = 0 + protocol = "-1" + cidr_blocks = ["0.0.0.0/0"] + } +} + +resource "aws_instance" "mc" { + key_name = aws_key_pair.keypair.key_name + ami = local.settings.mc.ami + instance_type = local.settings.mc.instance_type + count = local.settings.mc.count + subnet_id = aws_subnet.subnet.id + availability_zone = local.settings.availability_zone + vpc_security_group_ids = [ aws_security_group.mc-sg.id ] + + tags = { + Name = "Simulator MC ${local.settings.basename}" + Owner = local.settings.owner + "passthrough:ansible_ssh_private_key_file" = local.settings.keypair.private_key + "passthrough:ansible_user" = local.settings.mc.user + } + + connection { + type = "ssh" + user = local.settings.mc.user + private_key = 
file("../${local.settings.keypair.private_key}") + host = self.public_ip + } + + provisioner "remote-exec" { + inline = [ + "wget -q https://repository.hazelcast.com/download/management-center/hazelcast-management-center-5.0.tar.gz", + "tar -xzvf hazelcast-management-center-5.0.tar.gz", + "while [ ! -f /var/lib/cloud/instance/boot-finished ]; do echo 'Waiting for cloud-init...'; sleep 1; done", + "sudo apt-get -y update", + "sudo apt-get -y install openjdk-11-jdk", + "nohup hazelcast-management-center-5.0/bin/start.sh > mc.out 2>&1 &", + "sleep 2" + ] + } +} + +output "mc" { + value = [aws_instance.mc.*] +} + + + + + + diff --git a/vector-search-simulator/vector-test-2/hazelcast.xml b/vector-search-simulator/vector-test-2/hazelcast.xml new file mode 100644 index 0000000000..0da5668ba5 --- /dev/null +++ b/vector-search-simulator/vector-test-2/hazelcast.xml @@ -0,0 +1,40 @@ + + + + workers + + + + + 5701 + + + + + + + + + + + false + log4j2 + 8 + 16 + + + + + + + + + + + + + + diff --git a/vector-search-simulator/vector-test-2/inventory.yaml b/vector-search-simulator/vector-test-2/inventory.yaml new file mode 100644 index 0000000000..165743a381 --- /dev/null +++ b/vector-search-simulator/vector-test-2/inventory.yaml @@ -0,0 +1,12 @@ +loadgenerators: + hosts: + 3.124.1.127: + ansible_ssh_private_key_file: key + ansible_user: ubuntu + private_ip: 10.0.195.59 +nodes: + hosts: + 18.199.96.216: + ansible_ssh_private_key_file: key + ansible_user: ubuntu + private_ip: 10.0.195.213 diff --git a/vector-search-simulator/vector-test-2/inventory_plan.yaml b/vector-search-simulator/vector-test-2/inventory_plan.yaml new file mode 100644 index 0000000000..b19b789da9 --- /dev/null +++ b/vector-search-simulator/vector-test-2/inventory_plan.yaml @@ -0,0 +1,53 @@ +provisioner: terraform +terraform_plan: aws +# Used for naming resources; give it some unique name specific to a set of benchmarks +basename: oshultseva-lacow +# Enter something here that identifies you. +owner: oshultseva +placement_group_name: None +region: eu-central-1 +availability_zone: eu-central-1a +vpc_id: vpc-002b5a4e5f8b8ece2 +internet_gateway_id: igw-02b8fe3ab75871205 +# Change the '20' to a different octet to prevent running into conflicts. 
+cidr_block: 10.0.195.0/24 + +keypair: + public_key: key.pub + private_key: key + +nodes: + count: 1 + instance_type: i3en.2xlarge + # default AWS AMI + # ami: ami-05cafdf7c9f772ad2 + # user: ec2-user + # ubuntu + ami: ami-04e601abe3e1a910f + user: ubuntu + placement_group: None + tenancy: null + +loadgenerators: + count: 1 + instance_type: i3en.2xlarge + # default AWS AMI + # ami: ami-05cafdf7c9f772ad2 + # user: ec2-user + # ubuntu + ami: ami-04e601abe3e1a910f + user: ubuntu + placement_group: None + tenancy: null + +mc: + instance_type: c5.4xlarge + count: 0 + # default AWS AMI + # ami: ami-05cafdf7c9f772ad2 + # user: ec2-user + # ubuntu + ami: ami-04e601abe3e1a910f + user: ubuntu + placement_group: None + tenancy: null diff --git a/vector-search-simulator/vector-test-2/setup b/vector-search-simulator/vector-test-2/setup new file mode 100755 index 0000000000..39245047da --- /dev/null +++ b/vector-search-simulator/vector-test-2/setup @@ -0,0 +1,14 @@ +#!/bin/bash + +# This script will set up the environment + +set -e + +inventory apply + +echo "Waiting for instances to start" +sleep 60 + +inventory install java --url https://download.java.net/java/GA/jdk21/fd2272bbf8e04c3dbaee13770090416c/35/GPL/openjdk-21_linux-x64_bin.tar.gz + +inventory install simulator \ No newline at end of file diff --git a/vector-search-simulator/vector-test-2/teardown b/vector-search-simulator/vector-test-2/teardown new file mode 100755 index 0000000000..21db8c6cc3 --- /dev/null +++ b/vector-search-simulator/vector-test-2/teardown @@ -0,0 +1,7 @@ +#!/bin/bash + +# This script will tear down the environment + +set -e + +inventory destroy \ No newline at end of file diff --git a/vector-search-simulator/vector-test-2/tests.yaml b/vector-search-simulator/vector-test-2/tests.yaml new file mode 100644 index 0000000000..dec3eab69d --- /dev/null +++ b/vector-search-simulator/vector-test-2/tests.yaml @@ -0,0 +1,43 @@ +- name: map_tiered + duration: 600s + repetitions: 1 + clients: 2 + members: 1 + driver: hazelcast-enterprise5 + version: maven=5.4-SNAPSHOT + client_args: > + -Xms3g + -Xmx3g + member_args: > + -Xms3g + -Xmx3g + -Dhazelcast.diagnostics.enabled=true + -Dhazelcast.diagnostics.metric.level=info + -Dhazelcast.diagnostics.invocation.sample.period.seconds=30 + -Dhazelcast.diagnostics.pending.invocations.period.seconds=30 + -Dhazelcast.diagnostics.slowoperations.period.seconds=30 + -Dhazelcast.diagnostics.storeLatency.period.seconds=60 + --add-modules java.se + --add-exports java.base/jdk.internal.ref=ALL-UNNAMED + --add-opens java.base/java.lang=ALL-UNNAMED + --add-opens java.base/sun.nio.ch=ALL-UNNAMED + --add-opens java.management/sun.management=ALL-UNNAMED + --add-opens jdk.management/com.sun.management.internal=ALL-UNNAMED + loadgenerator_hosts: loadgenerators + node_hosts: nodes + verify_enabled: False + performance_monitor_interval_seconds: 1 + warmup_seconds: 0 + cooldown_seconds: 0 + license_key: + test: + class: com.hazelcast.simulator.hz.map.LongStringMapTest + name: map_tiered + threadCount: 40 + getProb: 0 + putProb: 0 + setProb: 1 + keyDomain: 100_000_000 + minValueLength: 1_000 + maxValueLength: 1_000 + fillOnPrepare: False diff --git a/vector-search-simulator/vector-test-2/vectors-datasets-tests-v2.yaml b/vector-search-simulator/vector-test-2/vectors-datasets-tests-v2.yaml new file mode 100644 index 0000000000..9b86c72261 --- /dev/null +++ b/vector-search-simulator/vector-test-2/vectors-datasets-tests-v2.yaml @@ -0,0 +1,52 @@ +- name: read_only + duration: 30m + repetitions: 1 + clients: 1 + members: 
1 + driver: hazelcast-enterprise5 + version: maven=5.5.0-SNAPSHOT + client_args: > + -Xms30g + -Xmx30g + --add-modules java.se + --add-exports java.base/jdk.internal.ref=ALL-UNNAMED + --add-opens java.base/java.lang=ALL-UNNAMED + --add-opens java.base/sun.nio.ch=ALL-UNNAMED + --add-opens java.management/sun.management=ALL-UNNAMED + --add-opens jdk.management/com.sun.management.internal=ALL-UNNAMED + member_args: > + -Xms30g + -Xmx30g + --add-modules java.se + --add-exports java.base/jdk.internal.ref=ALL-UNNAMED + --add-opens java.base/java.lang=ALL-UNNAMED + --add-opens java.base/sun.nio.ch=ALL-UNNAMED + --add-opens java.management/sun.management=ALL-UNNAMED + --add-opens jdk.management/com.sun.management.internal=ALL-UNNAMED + loadgenerator_hosts: loadgenerators + node_hosts: nodes + verify_enabled: False + performance_monitor_interval_seconds: 1 + warmup_seconds: 0 + cooldown_seconds: 0 + license_key: LICENSE + test: + - class: com.hazelcast.simulator.tests.vector.VectorCollectionSearchDatasetTest + name: vector + threadCount: 1 +# ratePerSecond: 5_000 +# interval: 100us + logRateMs: 600_000 + searchProb: 1 +# numberOfIterations: 1_000_000 + # generated data parameters + datasetUrl: https://storage.googleapis.com/ann-filtered-benchmark/datasets/dbpedia_openai_1M.tgz + workingDirectory: /mnt/nvme1n1/workingDirectory/ + # loadFirst: 32_123 + # index params + metric: COSINE + maxDegree: 64 + efConstruction: 512 + # search param + limit: 1 + diff --git a/vector-search-simulator/vector-test/README.md b/vector-search-simulator/vector-test/README.md new file mode 100644 index 0000000000..d7758a3c4f --- /dev/null +++ b/vector-search-simulator/vector-test/README.md @@ -0,0 +1,36 @@ +The purpose of this simulator test is to benchmark Hazelcast vector search. + +To modify the environment, edit the `inventory_plan.yaml`. + +To create the environment: +```shell +inventory apply +``` + +To get an overview of the available instances: +```shell +cat inventory.yaml +``` + +To install the simulator and Java on the environment: +```shell +inventory install java +inventory install simulator +``` + +To tune the environment for the best performance: +```shell +inventory tune +``` + +Modify the tests by editing the `tests.yaml` file. + +To run the tests: +```shell +perftest run +``` + +To destroy the environment: 
+```shell +inventory destroy +``` \ No newline at end of file diff --git a/vector-search-simulator/vector-test/aws/auto_mount.py b/vector-search-simulator/vector-test/aws/auto_mount.py new file mode 100755 index 0000000000..db3e7886c2 --- /dev/null +++ b/vector-search-simulator/vector-test/aws/auto_mount.py @@ -0,0 +1,68 @@ +#!/usr/bin/env python3 + +import subprocess +import threading + +filesystem = "ext3" + + +def list_unmounted_partitions(): + cmd = 'lsblk --noheadings --raw -o NAME,MOUNTPOINT' + lines = subprocess.check_output(cmd, shell=True, text=True).strip().splitlines() + + devices = [] + partitions = [] + + for line in lines: + record = line.split() + name = record[0] + + if not name.startswith("nvme"): + continue + + if "p" in name: + partitions.append(name) + elif len(record) == 1: + devices.append(name) + + # removing entries while iterating a list skips elements, so iterate over a copy + for dev in list(devices): + for partition in partitions: + if partition.startswith(dev): + devices.remove(dev) + break + + return devices + + +def format_and_mount(dev): + cmd = f'sudo mkfs.{filesystem} /dev/{dev}' + print(cmd) + subprocess.run(cmd, shell=True, text=True, check=True) + + cmd = f'sudo mkdir -p /mnt/{dev}' + print(cmd) + subprocess.run(cmd, shell=True, text=True, check=True) + + cmd = f'sudo mount -t {filesystem} /dev/{dev} /mnt/{dev}' + print(cmd) + subprocess.run(cmd, shell=True, text=True, check=True) + + cmd = f'sudo chown ubuntu /mnt/{dev}' + print(cmd) + subprocess.run(cmd, shell=True, text=True, check=True) + + +unmounted = list_unmounted_partitions() + +print(unmounted) + +jobs = [] +for dev in unmounted: + # pass the callable and its argument; calling format_and_mount(dev) here would + # run it inline and hand Thread a None target, serializing the mounts + job = threading.Thread(target=format_and_mount, args=(dev,)) + jobs.append(job) + job.start() + +for job in jobs: + job.join() diff --git a/vector-search-simulator/vector-test/aws/main.tf b/vector-search-simulator/vector-test/aws/main.tf new file mode 100644 index 0000000000..1a01ca5910 --- /dev/null +++ b/vector-search-simulator/vector-test/aws/main.tf @@ -0,0 +1,378 @@ + +locals { + settings = yamldecode(file("../inventory_plan.yaml")) + private_key = file("../${local.settings.keypair.private_key}") + public_key = file("../${local.settings.keypair.public_key}") +} + +provider "aws" { + profile = "oshultseva" + region = local.settings.region +} + +resource "aws_default_vpc" "vpc" { + tags = { + Name = "Default VPC" + } +} + +#resource "aws_vpc" "prod-vpc" { +# cidr_block = "10.0.0.0/16" +# enable_dns_support = "true" #gives you an internal domain name +# enable_dns_hostnames = "true" #gives you an internal host name +# enable_classiclink = "false" +# instance_tenancy = "default" +# +# tags = { +# Name = "prod-vpc" +# } +#} + +#resource "aws_internet_gateway" "my_vpc_igw" { +# vpc_id = local.settings.vpc_id +# +# tags = { +# Name = "My VPC - Internet Gateway" +# } +#} + +resource "aws_key_pair" "keypair" { + key_name = "simulator-keypair-${local.settings.basename}" + public_key = local.public_key +} + +resource "aws_subnet" "subnet" { + vpc_id = local.settings.vpc_id + cidr_block = local.settings.cidr_block + availability_zone = local.settings.availability_zone + map_public_ip_on_launch = true + tags = { + Name = "Simulator Public Subnet ${local.settings.basename}" + } +} + +resource "aws_route_table" "route_table" { + vpc_id = local.settings.vpc_id + route { + cidr_block = "0.0.0.0/0" + gateway_id = local.settings.internet_gateway_id + } + + tags = { + Name = "Simulator Public Subnet Route Table ${local.settings.basename}" + } +} + +resource "aws_route_table_association" "route_table_association" { + subnet_id = aws_subnet.subnet.id + route_table_id =
aws_route_table.route_table.id +} + +# ========== nodes ========================== + +# Currently there is a single placement group defined for all nodes/load generators. +# If you want to use placement_group, uncomment the commented out 'placementgroup' +# configuration in the nodes and loadgenerators sections. +resource "aws_placement_group" "cluster_placement_group" { + name = "simulator-placement-group-${local.settings.basename}" + strategy = "cluster" +} + +resource "aws_security_group" "node-sg" { + name = "simulator-security-group-node-${local.settings.basename}" + description = "Security group for the node" + vpc_id = local.settings.vpc_id + + tags = { + Name = "Simulator Node Security Group ${local.settings.basename}", + Owner = local.settings.owner + } + + ingress { + description = "SSH" + from_port = 22 + to_port = 22 + protocol = "tcp" + cidr_blocks = ["0.0.0.0/0"] + } + + + ingress { + description = "iperf3_udp" + from_port = 3000 + to_port = 3000 + protocol = "udp" + cidr_blocks = ["0.0.0.0/0"] + } + + ingress { + description = "iperf3_tcp" + from_port = 3000 + to_port = 3000 + protocol = "tcp" + cidr_blocks = ["0.0.0.0/0"] + } + + ingress { + description = "Hazelcast" + from_port = 5701 + to_port = 5801 + protocol = "tcp" + cidr_blocks = ["0.0.0.0/0"] + } + + ingress { + description = "Simulator" + from_port = 9000 + to_port = 9001 + protocol = "tcp" + cidr_blocks = ["0.0.0.0/0"] + } + + ingress { + description = "Hazelcast-tpc" + from_port = 11000 + to_port = 12000 + protocol = "tcp" + cidr_blocks = ["0.0.0.0/0"] + } + + egress { + from_port = 0 + to_port = 0 + protocol = "-1" + cidr_blocks = ["0.0.0.0/0"] + } +} + +resource "aws_instance" "nodes" { + key_name = aws_key_pair.keypair.key_name + ami = local.settings.nodes.ami + instance_type = local.settings.nodes.instance_type + count = local.settings.nodes.count + availability_zone = local.settings.availability_zone + #placement_group = aws_placement_group.cluster_placement_group.name + vpc_security_group_ids = [ aws_security_group.node-sg.id ] + subnet_id = aws_subnet.subnet.id + tenancy = local.settings.nodes.tenancy + + tags = { + Name = "Simulator Node ${local.settings.basename}" + Owner = local.settings.owner + "passthrough:ansible_ssh_private_key_file" = local.settings.keypair.private_key + "passthrough:ansible_user" = local.settings.nodes.user + } + + connection { + type = "ssh" + user = local.settings.nodes.user + private_key = local.private_key + host = self.public_ip + } + + provisioner "file" { + source = "auto_mount.py" + destination = "/tmp/auto_mount.py" + } + + provisioner "remote-exec" { + inline = [ + "python3 /tmp/auto_mount.py" + ] + } +} + +output "nodes" { + value = [aws_instance.nodes.*] +} + +# ========== load generators ========================== + +resource "aws_security_group" "loadgenerator-sg" { + name = "simulator-security-group-loadgenerator-${local.settings.basename}" + description = "Security group for the loadgenerator" + vpc_id = local.settings.vpc_id + + tags = { + Name = "Simulator Load Balancer Security Group ${local.settings.basename}", + Owner = local.settings.owner + } + + ingress { + description = "SSH" + from_port = 22 + to_port = 22 + protocol = "tcp" + cidr_blocks = ["0.0.0.0/0"] + } + + + ingress { + description = "iperf3_udp" + from_port = 3000 + to_port = 3000 + protocol = "udp" + cidr_blocks = ["0.0.0.0/0"] + } + + ingress { + description = "iperf3_tcp" + from_port = 3000 + to_port = 3000 + protocol = "tcp" + cidr_blocks = ["0.0.0.0/0"] + } + + ingress { + description = 
"Hazelcast" + from_port = 5701 + to_port = 5801 + protocol = "tcp" + cidr_blocks = ["0.0.0.0/0"] + } + + ingress { + description = "Simulator" + from_port = 9000 + to_port = 9001 + protocol = "tcp" + cidr_blocks = ["0.0.0.0/0"] + } + + egress { + from_port = 0 + to_port = 0 + protocol = "-1" + cidr_blocks = ["0.0.0.0/0"] + } +} + +resource "aws_instance" "loadgenerators" { + key_name = aws_key_pair.keypair.key_name + ami = local.settings.loadgenerators.ami + instance_type = local.settings.loadgenerators.instance_type + count = local.settings.loadgenerators.count + subnet_id = aws_subnet.subnet.id + availability_zone = local.settings.availability_zone + #placement_group = aws_placement_group.cluster_placement_group.name + vpc_security_group_ids = [ aws_security_group.loadgenerator-sg.id ] + tenancy = local.settings.loadgenerators.tenancy + tags = { + Name = "Simulator Load Generator ${local.settings.basename}" + Owner = local.settings.owner + "passthrough:ansible_ssh_private_key_file" = local.settings.keypair.private_key + "passthrough:ansible_user" = local.settings.loadgenerators.user + } + + connection { + type = "ssh" + user = local.settings.loadgenerators.user + private_key = local.private_key + host = self.public_ip + } + + provisioner "file" { + source = "auto_mount.py" + destination = "/tmp/auto_mount.py" + } + + provisioner "remote-exec" { + inline = [ + "python3 /tmp/auto_mount.py" + ] + } +} + +output "loadgenerators" { + value = [aws_instance.loadgenerators.*] +} + +# ========== management center ========================== + +resource "aws_security_group" "mc-sg" { + name = "simulator-security-group-mc-${local.settings.basename}" + description = "Security group for the Management Center" + vpc_id = local.settings.vpc_id + + tags = { + Name = "Simulator MC Security Group ${local.settings.basename}", + Owner = local.settings.owner + } + + ingress { + description = "SSH" + from_port = 22 + to_port = 22 + protocol = "tcp" + cidr_blocks = ["0.0.0.0/0"] + } + + ingress { + description = "Hazelcast" + from_port = 8080 + to_port = 8080 + protocol = "tcp" + cidr_blocks = ["0.0.0.0/0"] + } + + ingress { + description = "Simulator" + from_port = 8443 + to_port = 8443 + protocol = "tcp" + cidr_blocks = ["0.0.0.0/0"] + } + + egress { + from_port = 0 + to_port = 0 + protocol = "-1" + cidr_blocks = ["0.0.0.0/0"] + } +} + +resource "aws_instance" "mc" { + key_name = aws_key_pair.keypair.key_name + ami = local.settings.mc.ami + instance_type = local.settings.mc.instance_type + count = local.settings.mc.count + subnet_id = aws_subnet.subnet.id + availability_zone = local.settings.availability_zone + vpc_security_group_ids = [ aws_security_group.mc-sg.id ] + + tags = { + Name = "Simulator MC ${local.settings.basename}" + Owner = local.settings.owner + "passthrough:ansible_ssh_private_key_file" = local.settings.keypair.private_key + "passthrough:ansible_user" = local.settings.mc.user + } + + connection { + type = "ssh" + user = local.settings.mc.user + private_key = file("../${local.settings.keypair.private_key}") + host = self.public_ip + } + + provisioner "remote-exec" { + inline = [ + "wget -q https://repository.hazelcast.com/download/management-center/hazelcast-management-center-5.0.tar.gz", + "tar -xzvf hazelcast-management-center-5.0.tar.gz", + "while [ ! 
-f /var/lib/cloud/instance/boot-finished ]; do echo 'Waiting for cloud-init...'; sleep 1; done", + "sudo apt-get -y update", + "sudo apt-get -y install openjdk-11-jdk", + "nohup hazelcast-management-center-5.0/bin/start.sh > mc.out 2>&1 &", + "sleep 2" + ] + } +} + +output "mc" { + value = [aws_instance.mc.*] +} + + + + + + diff --git a/vector-search-simulator/vector-test/hazelcast.xml b/vector-search-simulator/vector-test/hazelcast.xml new file mode 100644 index 0000000000..a47d3273e2 --- /dev/null +++ b/vector-search-simulator/vector-test/hazelcast.xml @@ -0,0 +1,40 @@ + + + + workers + + + + + 5701 + + + + + + + + + + + false + log4j2 + 16 + + + + + + + + + + + + + + + diff --git a/vector-search-simulator/vector-test/inventory.yaml b/vector-search-simulator/vector-test/inventory.yaml new file mode 100644 index 0000000000..bc54e4615d --- /dev/null +++ b/vector-search-simulator/vector-test/inventory.yaml @@ -0,0 +1,12 @@ +loadgenerators: + hosts: + 3.68.98.252: + ansible_ssh_private_key_file: key + ansible_user: ubuntu + private_ip: 10.0.194.113 +nodes: + hosts: + 3.70.95.192: + ansible_ssh_private_key_file: key + ansible_user: ubuntu + private_ip: 10.0.194.248 diff --git a/vector-search-simulator/vector-test/inventory_plan.yaml b/vector-search-simulator/vector-test/inventory_plan.yaml new file mode 100644 index 0000000000..d177969a28 --- /dev/null +++ b/vector-search-simulator/vector-test/inventory_plan.yaml @@ -0,0 +1,53 @@ +provisioner: terraform +terraform_plan: aws +# Used for naming resources; give it some unique name specific to a set of benchmarks +basename: oshultseva-hcbsg +# Enter something here that identifies you. +owner: oshultseva +placement_group_name: None +region: eu-central-1 +availability_zone: eu-central-1a +vpc_id: vpc-002b5a4e5f8b8ece2 +internet_gateway_id: igw-02b8fe3ab75871205 +# Change the '20' to a different octet to prevent running into conflicts. 
+cidr_block: 10.0.194.0/24 + +keypair: + public_key: key.pub + private_key: key + +nodes: + count: 1 + instance_type: i3en.2xlarge + # default AWS AMI + # ami: ami-05cafdf7c9f772ad2 + # user: ec2-user + # ubuntu + ami: ami-04e601abe3e1a910f + user: ubuntu + placement_group: None + tenancy: null + +loadgenerators: + count: 1 + instance_type: i3en.2xlarge + # default AWS AMI + # ami: ami-05cafdf7c9f772ad2 + # user: ec2-user + # ubuntu + ami: ami-04e601abe3e1a910f + user: ubuntu + placement_group: None + tenancy: null + +mc: + instance_type: c5.4xlarge + count: 0 + # default AWS AMI + # ami: ami-05cafdf7c9f772ad2 + # user: ec2-user + # ubuntu + ami: ami-04e601abe3e1a910f + user: ubuntu + placement_group: None + tenancy: null diff --git a/vector-search-simulator/vector-test/setup b/vector-search-simulator/vector-test/setup new file mode 100755 index 0000000000..39245047da --- /dev/null +++ b/vector-search-simulator/vector-test/setup @@ -0,0 +1,14 @@ +#!/bin/bash + +# This script will set up the environment + +set -e + +inventory apply + +echo "Waiting for instances to start" +sleep 60 + +inventory install java --url https://download.java.net/java/GA/jdk21/fd2272bbf8e04c3dbaee13770090416c/35/GPL/openjdk-21_linux-x64_bin.tar.gz + +inventory install simulator \ No newline at end of file diff --git a/vector-search-simulator/vector-test/teardown b/vector-search-simulator/vector-test/teardown new file mode 100755 index 0000000000..21db8c6cc3 --- /dev/null +++ b/vector-search-simulator/vector-test/teardown @@ -0,0 +1,7 @@ +#!/bin/bash + +# This script will tear down the environment + +set -e + +inventory destroy \ No newline at end of file diff --git a/vector-search-simulator/vector-test/v_dbpedia_openai_1M_all.yaml b/vector-search-simulator/vector-test/v_dbpedia_openai_1M_all.yaml new file mode 100644 index 0000000000..6fead71013 --- /dev/null +++ b/vector-search-simulator/vector-test/v_dbpedia_openai_1M_all.yaml @@ -0,0 +1,188 @@ +- name: dbpedia_openai_1M + duration: 60m + repetitions: 1 + clients: 1 + members: 1 + driver: hazelcast-enterprise5 + version: maven=5.5.0-SNAPSHOT + client_args: > + -Xms30g + -Xmx30g + --add-modules jdk.incubator.vector + --enable-preview + --enable-native-access=ALL-UNNAMED + --add-modules java.se + --add-exports java.base/jdk.internal.ref=ALL-UNNAMED + --add-opens java.base/java.lang=ALL-UNNAMED + --add-opens java.base/sun.nio.ch=ALL-UNNAMED + --add-opens java.management/sun.management=ALL-UNNAMED + --add-opens jdk.management/com.sun.management.internal=ALL-UNNAMED + member_args: > + -Xms60g + -Xmx60g + --add-modules jdk.incubator.vector + --enable-preview + --enable-native-access=ALL-UNNAMED + --add-modules java.se + --add-exports java.base/jdk.internal.ref=ALL-UNNAMED + --add-opens java.base/java.lang=ALL-UNNAMED + --add-opens java.base/sun.nio.ch=ALL-UNNAMED + --add-opens java.management/sun.management=ALL-UNNAMED + --add-opens jdk.management/com.sun.management.internal=ALL-UNNAMED + loadgenerator_hosts: loadgenerators + node_hosts: nodes + verify_enabled: False + performance_monitor_interval_seconds: 1 + warmup_seconds: 0 + cooldown_seconds: 0 + license_key: [LICENCE] + parallel: False + test: + - class: com.hazelcast.simulator.tests.vector.VectorCollectionSearchDatasetTest + name: vector_m_16_ef_128 + threadCount: 1 + # ratePerSecond: 5_000 + # interval: 100us + logRateMs: 60_000 + searchProb: 1 + # data parameters + datasetUrl: https://storage.googleapis.com/ann-filtered-benchmark/datasets/dbpedia_openai_1M.tgz + workingDirectory: /mnt/nvme1n1/workingDirectory/ + 
# loadFirst: 10_ + # index params + metric: DOT + maxDegree: 16 + efConstruction: 128 + normalize: false + # search params + limit: 10 + - class: com.hazelcast.simulator.tests.vector.VectorCollectionSearchDatasetTest + name: vector_m_32_ef_128 + threadCount: 1 + # ratePerSecond: 5_000 + # interval: 100us + logRateMs: 60_000 + searchProb: 1 + # data parameters + datasetUrl: https://storage.googleapis.com/ann-filtered-benchmark/datasets/dbpedia_openai_1M.tgz + workingDirectory: /mnt/nvme1n1/workingDirectory/ + # loadFirst: 10_ + # index params + metric: DOT + maxDegree: 32 + efConstruction: 128 + normalize: false + # search params + limit: 10 + - class: com.hazelcast.simulator.tests.vector.VectorCollectionSearchDatasetTest + name: vector_m_64_ef_128 + threadCount: 1 + logRateMs: 60_000 + searchProb: 1 + # data parameters + datasetUrl: https://storage.googleapis.com/ann-filtered-benchmark/datasets/dbpedia_openai_1M.tgz + workingDirectory: /mnt/nvme1n1/workingDirectory/ + # loadFirst: 10_ + # index params + metric: DOT + maxDegree: 64 + efConstruction: 128 + normalize: false + # search params + limit: 10 + - class: com.hazelcast.simulator.tests.vector.VectorCollectionSearchDatasetTest + name: vector_m_16_ef_256 + threadCount: 1 + logRateMs: 60_000 + searchProb: 1 + # data parameters + datasetUrl: https://storage.googleapis.com/ann-filtered-benchmark/datasets/dbpedia_openai_1M.tgz + workingDirectory: /mnt/nvme1n1/workingDirectory/ + # loadFirst: 10_ + # index params + metric: DOT + maxDegree: 16 + efConstruction: 256 + normalize: false + # search params + limit: 10 + - class: com.hazelcast.simulator.tests.vector.VectorCollectionSearchDatasetTest + name: vector_m_32_ef_256 + threadCount: 1 + logRateMs: 60_000 + searchProb: 1 + # data parameters + datasetUrl: https://storage.googleapis.com/ann-filtered-benchmark/datasets/dbpedia_openai_1M.tgz + workingDirectory: /mnt/nvme1n1/workingDirectory/ + # loadFirst: 10_ + # index params + metric: DOT + maxDegree: 32 + efConstruction: 256 + normalize: false + # search params + limit: 10 + - class: com.hazelcast.simulator.tests.vector.VectorCollectionSearchDatasetTest + name: vector_m_64_ef_256 + threadCount: 1 + logRateMs: 60_000 + searchProb: 1 + # data parameters + datasetUrl: https://storage.googleapis.com/ann-filtered-benchmark/datasets/dbpedia_openai_1M.tgz + workingDirectory: /mnt/nvme1n1/workingDirectory/ + # loadFirst: 10_ + # index params + metric: DOT + maxDegree: 64 + efConstruction: 256 + normalize: false + # search params + limit: 10 + - class: com.hazelcast.simulator.tests.vector.VectorCollectionSearchDatasetTest + name: vector_m_16_ef_512 + threadCount: 1 + logRateMs: 60_000 + searchProb: 1 + # data parameters + datasetUrl: https://storage.googleapis.com/ann-filtered-benchmark/datasets/dbpedia_openai_1M.tgz + workingDirectory: /mnt/nvme1n1/workingDirectory/ + # loadFirst: 10_ + # index params + metric: DOT + maxDegree: 16 + efConstruction: 512 + normalize: false + # search params + limit: 10 + - class: com.hazelcast.simulator.tests.vector.VectorCollectionSearchDatasetTest + name: vector_m_32_ef_512 + threadCount: 1 + logRateMs: 60_000 + searchProb: 1 + # data parameters + datasetUrl: https://storage.googleapis.com/ann-filtered-benchmark/datasets/dbpedia_openai_1M.tgz + workingDirectory: /mnt/nvme1n1/workingDirectory/ + # loadFirst: 10_ + # index params + metric: DOT + maxDegree: 32 + efConstruction: 512 + normalize: false + # search params + limit: 10 + - class: com.hazelcast.simulator.tests.vector.VectorCollectionSearchDatasetTest + name: 
vector_m_64_ef_512 + threadCount: 1 + logRateMs: 60_000 + searchProb: 1 + # data parameters + datasetUrl: https://storage.googleapis.com/ann-filtered-benchmark/datasets/dbpedia_openai_1M.tgz + workingDirectory: /mnt/nvme1n1/workingDirectory/ + # loadFirst: 10_ + # index params + metric: DOT + maxDegree: 64 + efConstruction: 512 + normalize: false + # search params + limit: 10 \ No newline at end of file diff --git a/vector-search-simulator/vector-test/vectors-datasets-tests-v2.yaml b/vector-search-simulator/vector-test/vectors-datasets-tests-v2.yaml new file mode 100644 index 0000000000..89a62d12f7 --- /dev/null +++ b/vector-search-simulator/vector-test/vectors-datasets-tests-v2.yaml @@ -0,0 +1,51 @@ +- name: read_only + duration: 30m + repetitions: 1 + clients: 1 + members: 1 + driver: hazelcast-enterprise5 + version: maven=5.5.0-SNAPSHOT + client_args: > + -Xms30g + -Xmx30g + --add-modules java.se + --add-exports java.base/jdk.internal.ref=ALL-UNNAMED + --add-opens java.base/java.lang=ALL-UNNAMED + --add-opens java.base/sun.nio.ch=ALL-UNNAMED + --add-opens java.management/sun.management=ALL-UNNAMED + --add-opens jdk.management/com.sun.management.internal=ALL-UNNAMED + member_args: > + -Xms30g + -Xmx30g + --add-modules java.se + --add-exports java.base/jdk.internal.ref=ALL-UNNAMED + --add-opens java.base/java.lang=ALL-UNNAMED + --add-opens java.base/sun.nio.ch=ALL-UNNAMED + --add-opens java.management/sun.management=ALL-UNNAMED + --add-opens jdk.management/com.sun.management.internal=ALL-UNNAMED + loadgenerator_hosts: loadgenerators + node_hosts: nodes + verify_enabled: False + performance_monitor_interval_seconds: 1 + warmup_seconds: 0 + cooldown_seconds: 0 + license_key: LICENSE + test: + - class: com.hazelcast.simulator.tests.vector.VectorCollectionSearchDatasetTest + name: vector + threadCount: 1 + # ratePerSecond: 5_000 + # interval: 100us + logRateMs: 60_000 + searchProb: 1 + # numberOfIterations: 1_000_000 + # data parameters + datasetUrl: http://ann-benchmarks.com/gist-960-euclidean.hdf5 + workingDirectory: /mnt/nvme1n1/workingDirectory/ + # loadFirst: 10_123 + # index params + metric: EUCLIDEAN + maxDegree: 32 + efConstruction: 512 + # search param + limit: 1 \ No newline at end of file diff --git a/vector-search-simulator/vector-test/vectors-datasets-tests.yaml b/vector-search-simulator/vector-test/vectors-datasets-tests.yaml new file mode 100644 index 0000000000..d422aa1773 --- /dev/null +++ b/vector-search-simulator/vector-test/vectors-datasets-tests.yaml @@ -0,0 +1,52 @@ +- name: read_only + duration: 30m + repetitions: 1 + clients: 1 + members: 1 + driver: hazelcast-enterprise5 + version: maven=5.5.0-SNAPSHOT + client_args: > + -Xms30g + -Xmx30g + --add-modules java.se + --add-exports java.base/jdk.internal.ref=ALL-UNNAMED + --add-opens java.base/java.lang=ALL-UNNAMED + --add-opens java.base/sun.nio.ch=ALL-UNNAMED + --add-opens java.management/sun.management=ALL-UNNAMED + --add-opens jdk.management/com.sun.management.internal=ALL-UNNAMED + member_args: > + -Xms30g + -Xmx30g + --add-modules java.se + --add-exports java.base/jdk.internal.ref=ALL-UNNAMED + --add-opens java.base/java.lang=ALL-UNNAMED + --add-opens java.base/sun.nio.ch=ALL-UNNAMED + --add-opens java.management/sun.management=ALL-UNNAMED + --add-opens jdk.management/com.sun.management.internal=ALL-UNNAMED + loadgenerator_hosts: loadgenerators + node_hosts: nodes + verify_enabled: False + performance_monitor_interval_seconds: 1 + warmup_seconds: 0 + cooldown_seconds: 0 + license_key: LICENSE + test: + - class: 
com.hazelcast.simulator.tests.vector.VectorCollectionFromUrlTest + name: vector + threadCount: 1 +# ratePerSecond: 5_000 +# interval: 100us + logRateMs: 60_000 + searchProb: 1 +# numberOfIterations: 1_000_000 + # generated data parameters + datasetUrl: https://storage.googleapis.com/ann-filtered-benchmark/datasets/dbpedia_openai_1M.tgz + workingDirectory: /mnt/nvme1n1/workingDirectory/ + # loadFirst: 30_000 + # index params + metric: COSINE + maxDegree: 32 + efConstruction: 512 + # search param + limit: 1 +
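For readers mapping these yaml knobs onto the product API: `metric`, `maxDegree`, and `efConstruction` configure the HNSW index of the vector collection the test populates, and `limit` is the top-k of each search. The sketch below shows the rough member-side equivalent, assuming the `com.hazelcast.config.vector` API that ships with Hazelcast Enterprise 5.5; the collection and index names are illustrative, and exact setter names may differ slightly.

```java
import com.hazelcast.config.Config;
import com.hazelcast.config.vector.Metric;
import com.hazelcast.config.vector.VectorCollectionConfig;
import com.hazelcast.config.vector.VectorIndexConfig;

public class VectorIndexConfigSketch {
    public static void main(String[] args) {
        Config config = new Config();
        // Rough equivalent of: metric: COSINE, maxDegree: 32, efConstruction: 512.
        // The dimension must match the dataset; dbpedia_openai_1M contains
        // 1536-dimensional OpenAI embeddings.
        config.addVectorCollectionConfig(
                new VectorCollectionConfig("vectors")
                        .addVectorIndexConfig(new VectorIndexConfig()
                                .setName("vector-index")
                                .setDimension(1536)
                                .setMetric(Metric.COSINE)
                                .setMaxDegree(32)
                                .setEfConstruction(512)));
    }
}
```

Higher `maxDegree` and `efConstruction` values buy search recall at the cost of index build time and memory, which is exactly the trade-off the m/ef sweep in `v_dbpedia_openai_1M_all.yaml` measures.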