-
Notifications
You must be signed in to change notification settings - Fork 74
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Vector db performance test #2217
Draft
shultseva
wants to merge
11
commits into
hazelcast:master
Choose a base branch
from
shultseva:db_vector_performance_test
base: master
Could not load branches
Branch not found: {{ refName }}
Loading
Could not load tags
Nothing to show
Loading
Are you sure you want to change the base?
Some commits from the old base branch may be removed from the timeline,
and old review comments may become outdated.
Draft
Changes from 10 commits
Commits
Show all changes
11 commits
Select commit
Hold shift + click to select a range
e09f004
vector db test
shultseva f83f2a7
vector db test
shultseva 6c8ed0f
vector db test
shultseva a1fc789
vector db test
shultseva 119d7c6
vector db test
shultseva 403c538
vector db test
shultseva 0b0f294
update repositories and fix rsync for OSX clients
abramche 6d3f481
Merge pull request #3 from shultseva/update_repositories
shultseva d0e858c
fix failing private snapshot repository lookup and update README.md
abramche d1c2d97
Merge pull request #4 from shultseva/fully_qualified_internal_snapsho…
shultseva aee8f32
Merge branch 'master' into db_vector_performance_test
abramche File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
141 changes: 141 additions & 0 deletions
141
...iver-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/DatasetReader.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,141 @@ | ||
package com.hazelcast.simulator.tests.vector; | ||
|
||
import com.hazelcast.simulator.tests.vector.model.TestDataset; | ||
import com.hazelcast.simulator.tests.vector.readers.HDF5DatasetReader; | ||
import com.hazelcast.simulator.tests.vector.readers.NpyArchiveDatasetReader; | ||
import org.apache.commons.io.FileUtils; | ||
import org.apache.commons.io.FilenameUtils; | ||
import org.apache.commons.io.IOUtils; | ||
import org.apache.http.HttpResponse; | ||
import org.apache.http.client.ResponseHandler; | ||
import org.apache.http.client.methods.HttpGet; | ||
import org.apache.http.impl.client.CloseableHttpClient; | ||
import org.apache.http.impl.client.HttpClients; | ||
import org.apache.http.impl.client.LaxRedirectStrategy; | ||
import org.apache.logging.log4j.LogManager; | ||
import org.apache.logging.log4j.Logger; | ||
|
||
import java.io.File; | ||
import java.io.IOException; | ||
import java.io.InputStream; | ||
import java.net.MalformedURLException; | ||
import java.net.URI; | ||
import java.net.URL; | ||
import java.nio.file.Path; | ||
|
||
public abstract class DatasetReader { | ||
|
||
private final URL datasetURL; | ||
|
||
protected final Path workingDirectory; | ||
|
||
protected final File downloadedFile; | ||
|
||
protected float[][] trainDataset; | ||
|
||
protected TestDataset testDataset; | ||
|
||
protected int dimension; | ||
|
||
protected int size; | ||
|
||
protected Boolean normalizeVector = false; | ||
|
||
protected final Logger logger = LogManager.getLogger(getClass()); | ||
|
||
public DatasetReader(String url, String directory) { | ||
this(url, directory, false); | ||
} | ||
public DatasetReader(String url, String directory, Boolean normalizeVector) { | ||
try { | ||
this.datasetURL = URI.create(url).toURL(); | ||
this.workingDirectory = Path.of(directory, FilenameUtils.getBaseName(datasetURL.getFile())); | ||
this.downloadedFile = Path.of(workingDirectory.toString(), FilenameUtils.getName(datasetURL.getFile())).toFile(); | ||
this.normalizeVector = normalizeVector; | ||
|
||
logger.info("Start downloading file from {}", datasetURL); | ||
if (!downloadedFile.exists()) { | ||
download(); | ||
} | ||
logger.info("File downloaded to {}. Start unpacking...", downloadedFile); | ||
|
||
preprocessDatasetFile(); | ||
parseTrainDataset(); | ||
parseTestDataset(); | ||
logger.info("Dataset reader is initialized"); | ||
} catch (IOException e) { | ||
throw new RuntimeException(e); | ||
} | ||
} | ||
|
||
protected abstract void preprocessDatasetFile(); | ||
protected abstract void parseTrainDataset(); | ||
protected abstract void parseTestDataset(); | ||
|
||
private void cleanup() { | ||
try { | ||
FileUtils.cleanDirectory(workingDirectory.toFile()); | ||
} catch (IOException e) { | ||
throw new RuntimeException(e); | ||
} | ||
} | ||
|
||
public float[] getTrainVector(int index) { | ||
return trainDataset[index]; | ||
} | ||
|
||
public TestDataset getTestDataset() { | ||
return testDataset; | ||
} | ||
|
||
public int getDimension() { | ||
return dimension; | ||
} | ||
|
||
public int getSize() { | ||
return size; | ||
} | ||
|
||
public int getTestDatasetDimension() { | ||
return testDataset.getDimension(); | ||
} | ||
|
||
private void download() { | ||
CloseableHttpClient httpClient = HttpClients.custom() | ||
.setRedirectStrategy(new LaxRedirectStrategy()) | ||
.build(); | ||
try { | ||
HttpGet get = new HttpGet(datasetURL.toURI()); | ||
httpClient.execute(get, new FileDownloadResponseHandler(downloadedFile)); | ||
} catch (Exception e) { | ||
throw new IllegalStateException(e); | ||
} finally { | ||
IOUtils.closeQuietly(httpClient); | ||
} | ||
} | ||
|
||
|
||
private record FileDownloadResponseHandler(File target) implements ResponseHandler<Void> { | ||
|
||
@Override | ||
public Void handleResponse(HttpResponse response) throws IOException { | ||
InputStream source = response.getEntity().getContent(); | ||
FileUtils.copyInputStreamToFile(source, this.target); | ||
return null; | ||
} | ||
} | ||
|
||
public static DatasetReader create(String url, String directory, boolean normalizeVector) { | ||
try { | ||
URL datasetUrl = URI.create(url).toURL(); | ||
var ext = FilenameUtils.getExtension(datasetUrl.getFile()); | ||
return switch (ext) { | ||
case "hdf5" -> new HDF5DatasetReader(url, directory, normalizeVector); | ||
case "tgz" -> new NpyArchiveDatasetReader(url, directory, normalizeVector); | ||
default -> throw new UnsupportedOperationException("File " + ext + " is not supported"); | ||
}; | ||
} catch (MalformedURLException e) { | ||
throw new RuntimeException(e); | ||
} | ||
} | ||
} |
149 changes: 149 additions & 0 deletions
149
...r-hazelcast4plus/src/main/java/com/hazelcast/simulator/tests/vector/NpyDatasetReader.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,149 @@ | ||
package com.hazelcast.simulator.tests.vector; | ||
|
||
import com.google.gson.JsonArray; | ||
import com.google.gson.JsonParser; | ||
import com.hazelcast.simulator.tests.vector.readers.TarExtractor; | ||
import org.apache.commons.io.FileUtils; | ||
import org.apache.commons.io.FilenameUtils; | ||
import org.apache.logging.log4j.LogManager; | ||
import org.apache.logging.log4j.Logger; | ||
import org.jetbrains.bio.npy.NpyArray; | ||
import org.jetbrains.bio.npy.NpyFile; | ||
|
||
import java.io.File; | ||
import java.io.FileInputStream; | ||
import java.io.IOException; | ||
import java.net.URI; | ||
import java.net.URL; | ||
import java.nio.charset.Charset; | ||
import java.nio.file.Path; | ||
import java.util.Arrays; | ||
import java.util.List; | ||
|
||
@Deprecated | ||
public class NpyDatasetReader { | ||
|
||
private final URL datasetURL; | ||
|
||
private final Path workingDirectory; | ||
|
||
private final File downloadedFile; | ||
|
||
private final Path vectorsNameFile; | ||
private final Path testNameFile; | ||
|
||
private float[] vectorsPlain; | ||
|
||
private List<float[]> query; | ||
|
||
private int dimension; | ||
private int size; | ||
|
||
protected final Logger logger = LogManager.getLogger(getClass()); | ||
|
||
public NpyDatasetReader(String url, String directory) { | ||
try { | ||
this.datasetURL = URI.create(url).toURL(); | ||
this.workingDirectory = Path.of(directory, FilenameUtils.getBaseName(datasetURL.getFile())); | ||
this.downloadedFile = Path.of(workingDirectory.toString(), FilenameUtils.getName(datasetURL.getFile())).toFile(); | ||
this.vectorsNameFile = Path.of(workingDirectory.toString(), "vectors.npy"); | ||
this.testNameFile = Path.of(workingDirectory.toString(), "tests.jsonl"); | ||
|
||
logger.info("Start downloading file from " + datasetURL); | ||
if (!downloadedFile.exists()) { | ||
download(); | ||
} | ||
logger.info("File downloaded to " + downloadedFile + ". Start unpacking..."); | ||
|
||
if (!vectorsNameFile.toFile().exists()) { | ||
unpack(); | ||
} | ||
logger.info("Unpacking finished. Start parse vectors..."); | ||
|
||
parseArray(); | ||
parseTestCases(); | ||
logger.info("Dataset reader is initialized"); | ||
} catch (IOException e) { | ||
throw new RuntimeException(e); | ||
} | ||
} | ||
|
||
private void download() { | ||
try { | ||
FileUtils.copyURLToFile( | ||
datasetURL, | ||
downloadedFile, | ||
120_000, | ||
60_000 * 60); | ||
} catch (IOException e) { | ||
throw new RuntimeException(e); | ||
} | ||
} | ||
|
||
private void unpack() throws IOException { | ||
TarExtractor.extractTarGZ(new FileInputStream(downloadedFile), workingDirectory); | ||
} | ||
|
||
private void parseArray() { | ||
NpyArray read = NpyFile.read(vectorsNameFile, Integer.MAX_VALUE); | ||
var shape = read.getShape(); | ||
size = shape[0]; | ||
dimension = shape[1]; | ||
vectorsPlain = read.asFloatArray(); | ||
} | ||
|
||
private void parseTestCases() { | ||
try { | ||
var parser = new JsonParser(); | ||
List<String> queryList = FileUtils.readLines(testNameFile.toFile(), Charset.defaultCharset()); | ||
query = queryList.stream() | ||
.map(query -> parser.parse(query).getAsJsonObject().getAsJsonArray("query")) | ||
.map(this::convert) | ||
.toList(); | ||
} catch (IOException e) { | ||
throw new RuntimeException(e); | ||
} | ||
} | ||
|
||
private float[] convert(JsonArray array) { | ||
var result = new float[array.size()]; | ||
for (int i = 0; i < array.size(); i++) { | ||
result[i] = array.get(i).getAsFloat(); | ||
} | ||
return result; | ||
} | ||
|
||
private void cleanup() { | ||
try { | ||
FileUtils.cleanDirectory(workingDirectory.toFile()); | ||
} catch (IOException e) { | ||
throw new RuntimeException(e); | ||
} | ||
} | ||
|
||
public float[] read(int index) { | ||
if (index >= size) { | ||
throw new RuntimeException("invalid index"); | ||
} | ||
return Arrays.copyOfRange(vectorsPlain, index * dimension, (index + 1) * dimension); | ||
} | ||
|
||
public List<float[]> getTestCases() { | ||
return query; | ||
} | ||
|
||
public int getDimension() { | ||
return dimension; | ||
} | ||
|
||
public int getSize() { | ||
return size; | ||
} | ||
|
||
public int getQueryDimension() { | ||
if(query.isEmpty()) { | ||
return 0; | ||
} | ||
return query.get(0).length; | ||
} | ||
} |
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
needs to be 5.5.0
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
aee8f32
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
oops, looks like there were more changes in master that got merged into the PR -- need to scroll through them again