Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Vector db perfromance test #2217

Draft
wants to merge 11 commits into
base: master
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from 10 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -195,6 +195,11 @@ To install Java on the remote machines call:
inventory install java
```

Note that starting from 5.5.0-SNAPSHOT onwards, the private snapshots repository is used, so access to this repository
needs to be configured both locally (where simulator is being built and perftest is kicked off) and remotely (on load
generators and so on...).
See [here](https://hazelcast.atlassian.net/wiki/spaces/DI/pages/5116493883/How+to+access+private+snapshot+maven+artifacts+and+docker+images) for details.

You can pass a custom URL to configure the correct JVM. To get a listing of examples URL's call:

```shell
Expand Down
2 changes: 1 addition & 1 deletion bin/hidden/agent
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@ if [ -f "agent.pid" ] ; then
fi

if [ -z "${SIMULATOR_HOME}" ] ; then
export SIMULATOR_HOME=$(cd $(dirname $(readlink -f $0 2> /dev/null || readlink $0 2> /dev/null || echo $0))/../.. && pwd)
export SIMULATOR_HOME=$(cd $(dirname $(readlink -f $0 2> /dev/null || readlink -f $0 2> /dev/null || echo $0))/../.. && pwd)
fi

export JAVA_OPTS="-server -Xmx2g -Xms512m -XX:+HeapDumpOnOutOfMemoryError ${JAVA_EXTRA_OPTS}"
Expand Down
4 changes: 3 additions & 1 deletion java/drivers/driver-hazelcast4plus/conf/install.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,7 +69,9 @@ def _get_remote_repo(is_enterprise:bool, version:str):
return "https://oss.sonatype.org/content/repositories/releases"
else:
if version.endswith("-SNAPSHOT"):
return "https://repository.hazelcast.com/snapshot"
# maven ignores settings.xml authentication unless forced with fully qualified remoteRepositories construct
# https://maven.apache.org/plugins/maven-dependency-plugin/get-mojo.html
return "snapshot-internal::::https://repository.hazelcast.com/snapshot-internal"
else:
return "https://repository.hazelcast.com/release"

Expand Down
44 changes: 43 additions & 1 deletion java/drivers/driver-hazelcast4plus/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
</parent>

<properties>
<hazelcast.version>5.4.0-SNAPSHOT</hazelcast.version>
<hazelcast.version>5.5.0-SNAPSHOT</hazelcast.version>
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

needs to be 5.5.0

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

oops, looks like there were more changes in master that got merged into the PR -- need to scroll through them again

<main.basedir>${project.parent.basedir}</main.basedir>
<netty.version>4.1.94.Final</netty.version>
<netty-tcnative.version>2.0.34.Final</netty-tcnative.version>
Expand Down Expand Up @@ -71,6 +71,12 @@
<version>${hazelcast.version}</version>
</dependency>

<dependency>
<groupId>com.hazelcast</groupId>
<artifactId>hazelcast-enterprise</artifactId>
<version>${hazelcast.version}</version>
</dependency>

<dependency>
<groupId>javax.cache</groupId>
<artifactId>cache-api</artifactId>
Expand Down Expand Up @@ -136,5 +142,41 @@
<version>${netty.version}</version>
</dependency>

<dependency>
<groupId>org.apache.commons</groupId>
<artifactId>commons-lang3</artifactId>
<version>3.14.0</version>
</dependency>

<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>${httpclient.version}</version>
</dependency>

<dependency>
<groupId>ai.djl</groupId>
<artifactId>api</artifactId>
<version>0.27.0</version>
</dependency>

<dependency>
<groupId>org.jetbrains.bio</groupId>
<artifactId>npy</artifactId>
<version>0.3.5</version>
</dependency>

<dependency>
<groupId>io.jhdf</groupId>
<artifactId>jhdf</artifactId>
<version>0.6.10</version>
</dependency>

<dependency>
<groupId>org.jctools</groupId>
<artifactId>jctools-core</artifactId>
<version>4.0.3</version>
</dependency>

</dependencies>
</project>
Original file line number Diff line number Diff line change
@@ -0,0 +1,141 @@
package com.hazelcast.simulator.tests.vector;

import com.hazelcast.simulator.tests.vector.model.TestDataset;
import com.hazelcast.simulator.tests.vector.readers.HDF5DatasetReader;
import com.hazelcast.simulator.tests.vector.readers.NpyArchiveDatasetReader;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.FilenameUtils;
import org.apache.commons.io.IOUtils;
import org.apache.http.HttpResponse;
import org.apache.http.client.ResponseHandler;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.impl.client.LaxRedirectStrategy;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;

import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.net.MalformedURLException;
import java.net.URI;
import java.net.URL;
import java.nio.file.Path;

public abstract class DatasetReader {

private final URL datasetURL;

protected final Path workingDirectory;

protected final File downloadedFile;

protected float[][] trainDataset;

protected TestDataset testDataset;

protected int dimension;

protected int size;

protected Boolean normalizeVector = false;

protected final Logger logger = LogManager.getLogger(getClass());

public DatasetReader(String url, String directory) {
this(url, directory, false);
}
public DatasetReader(String url, String directory, Boolean normalizeVector) {
try {
this.datasetURL = URI.create(url).toURL();
this.workingDirectory = Path.of(directory, FilenameUtils.getBaseName(datasetURL.getFile()));
this.downloadedFile = Path.of(workingDirectory.toString(), FilenameUtils.getName(datasetURL.getFile())).toFile();
this.normalizeVector = normalizeVector;

logger.info("Start downloading file from {}", datasetURL);
if (!downloadedFile.exists()) {
download();
}
logger.info("File downloaded to {}. Start unpacking...", downloadedFile);

preprocessDatasetFile();
parseTrainDataset();
parseTestDataset();
logger.info("Dataset reader is initialized");
} catch (IOException e) {
throw new RuntimeException(e);
}
}

protected abstract void preprocessDatasetFile();
protected abstract void parseTrainDataset();
protected abstract void parseTestDataset();

private void cleanup() {
try {
FileUtils.cleanDirectory(workingDirectory.toFile());
} catch (IOException e) {
throw new RuntimeException(e);
}
}

public float[] getTrainVector(int index) {
return trainDataset[index];
}

public TestDataset getTestDataset() {
return testDataset;
}

public int getDimension() {
return dimension;
}

public int getSize() {
return size;
}

public int getTestDatasetDimension() {
return testDataset.getDimension();
}

private void download() {
CloseableHttpClient httpClient = HttpClients.custom()
.setRedirectStrategy(new LaxRedirectStrategy())
.build();
try {
HttpGet get = new HttpGet(datasetURL.toURI());
httpClient.execute(get, new FileDownloadResponseHandler(downloadedFile));
} catch (Exception e) {
throw new IllegalStateException(e);
} finally {
IOUtils.closeQuietly(httpClient);
}
}


private record FileDownloadResponseHandler(File target) implements ResponseHandler<Void> {

@Override
public Void handleResponse(HttpResponse response) throws IOException {
InputStream source = response.getEntity().getContent();
FileUtils.copyInputStreamToFile(source, this.target);
return null;
}
}

public static DatasetReader create(String url, String directory, boolean normalizeVector) {
try {
URL datasetUrl = URI.create(url).toURL();
var ext = FilenameUtils.getExtension(datasetUrl.getFile());
return switch (ext) {
case "hdf5" -> new HDF5DatasetReader(url, directory, normalizeVector);
case "tgz" -> new NpyArchiveDatasetReader(url, directory, normalizeVector);
default -> throw new UnsupportedOperationException("File " + ext + " is not supported");
};
} catch (MalformedURLException e) {
throw new RuntimeException(e);
}
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,149 @@
package com.hazelcast.simulator.tests.vector;

import com.google.gson.JsonArray;
import com.google.gson.JsonParser;
import com.hazelcast.simulator.tests.vector.readers.TarExtractor;
import org.apache.commons.io.FileUtils;
import org.apache.commons.io.FilenameUtils;
import org.apache.logging.log4j.LogManager;
import org.apache.logging.log4j.Logger;
import org.jetbrains.bio.npy.NpyArray;
import org.jetbrains.bio.npy.NpyFile;

import java.io.File;
import java.io.FileInputStream;
import java.io.IOException;
import java.net.URI;
import java.net.URL;
import java.nio.charset.Charset;
import java.nio.file.Path;
import java.util.Arrays;
import java.util.List;

@Deprecated
public class NpyDatasetReader {

private final URL datasetURL;

private final Path workingDirectory;

private final File downloadedFile;

private final Path vectorsNameFile;
private final Path testNameFile;

private float[] vectorsPlain;

private List<float[]> query;

private int dimension;
private int size;

protected final Logger logger = LogManager.getLogger(getClass());

public NpyDatasetReader(String url, String directory) {
try {
this.datasetURL = URI.create(url).toURL();
this.workingDirectory = Path.of(directory, FilenameUtils.getBaseName(datasetURL.getFile()));
this.downloadedFile = Path.of(workingDirectory.toString(), FilenameUtils.getName(datasetURL.getFile())).toFile();
this.vectorsNameFile = Path.of(workingDirectory.toString(), "vectors.npy");
this.testNameFile = Path.of(workingDirectory.toString(), "tests.jsonl");

logger.info("Start downloading file from " + datasetURL);
if (!downloadedFile.exists()) {
download();
}
logger.info("File downloaded to " + downloadedFile + ". Start unpacking...");

if (!vectorsNameFile.toFile().exists()) {
unpack();
}
logger.info("Unpacking finished. Start parse vectors...");

parseArray();
parseTestCases();
logger.info("Dataset reader is initialized");
} catch (IOException e) {
throw new RuntimeException(e);
}
}

private void download() {
try {
FileUtils.copyURLToFile(
datasetURL,
downloadedFile,
120_000,
60_000 * 60);
} catch (IOException e) {
throw new RuntimeException(e);
}
}

private void unpack() throws IOException {
TarExtractor.extractTarGZ(new FileInputStream(downloadedFile), workingDirectory);
}

private void parseArray() {
NpyArray read = NpyFile.read(vectorsNameFile, Integer.MAX_VALUE);
var shape = read.getShape();
size = shape[0];
dimension = shape[1];
vectorsPlain = read.asFloatArray();
}

private void parseTestCases() {
try {
var parser = new JsonParser();
List<String> queryList = FileUtils.readLines(testNameFile.toFile(), Charset.defaultCharset());
query = queryList.stream()
.map(query -> parser.parse(query).getAsJsonObject().getAsJsonArray("query"))
.map(this::convert)
.toList();
} catch (IOException e) {
throw new RuntimeException(e);
}
}

private float[] convert(JsonArray array) {
var result = new float[array.size()];
for (int i = 0; i < array.size(); i++) {
result[i] = array.get(i).getAsFloat();
}
return result;
}

private void cleanup() {
try {
FileUtils.cleanDirectory(workingDirectory.toFile());
} catch (IOException e) {
throw new RuntimeException(e);
}
}

public float[] read(int index) {
if (index >= size) {
throw new RuntimeException("invalid index");
}
return Arrays.copyOfRange(vectorsPlain, index * dimension, (index + 1) * dimension);
}

public List<float[]> getTestCases() {
return query;
}

public int getDimension() {
return dimension;
}

public int getSize() {
return size;
}

public int getQueryDimension() {
if(query.isEmpty()) {
return 0;
}
return query.get(0).length;
}
}
Loading