Skip to content

Commit

Permalink
ufal/be-s3-checksum-fix (#487)
Browse files Browse the repository at this point in the history
* Cherry picked fix S3 optimalization fixes from Vanilla

* Cherry picked adding of pagination on cleanup

* Updated Sync services according to S3 optimization fix

* The checksum value is compared between S3 and local assetstore and the new result is inserted to the database.

* Added docs and refactoring

* The checksum values are exposed by REST API

* Fixed failing tests

---------

Co-authored-by: Tim Donohue <[email protected]>
Co-authored-by: Luca Giamminonni <[email protected]>
  • Loading branch information
3 people authored Jan 3, 2024
1 parent faf9e42 commit bf4f503
Show file tree
Hide file tree
Showing 33 changed files with 1,245 additions and 764 deletions.
24 changes: 24 additions & 0 deletions dspace-api/pom.xml
Original file line number Diff line number Diff line change
Expand Up @@ -844,6 +844,24 @@
</exclusion>
</exclusions>
</dependency>

<dependency>
<groupId>io.findify</groupId>
<artifactId>s3mock_2.13</artifactId>
<version>0.2.6</version>
<scope>test</scope>
<exclusions>
<exclusion>
<groupId>com.amazonawsl</groupId>
<artifactId>aws-java-sdk-s3</artifactId>
</exclusion>
<exclusion>
<groupId>com.amazonaws</groupId>
<artifactId>aws-java-sdk-s3</artifactId>
</exclusion>
</exclusions>
</dependency>

</dependencies>

<dependencyManagement>
Expand Down Expand Up @@ -907,6 +925,12 @@
<artifactId>swagger-core</artifactId>
<version>1.6.2</version>
</dependency>
<dependency>
<groupId>org.scala-lang</groupId>
<artifactId>scala-library</artifactId>
<version>2.13.2</version>
<scope>test</scope>
</dependency>
</dependencies>
</dependencyManagement>

Expand Down
49 changes: 43 additions & 6 deletions dspace-api/src/main/java/org/dspace/checker/CheckerCommand.java
Original file line number Diff line number Diff line change
Expand Up @@ -7,10 +7,13 @@
*/
package org.dspace.checker;

import static org.dspace.storage.bitstore.SyncBitstreamStorageServiceImpl.SYNCHRONIZED_STORES_NUMBER;

import java.io.IOException;
import java.sql.SQLException;
import java.util.Date;
import java.util.Map;
import java.util.Objects;

import org.apache.commons.collections4.MapUtils;
import org.apache.logging.log4j.Logger;
Expand All @@ -20,8 +23,8 @@
import org.dspace.checker.service.MostRecentChecksumService;
import org.dspace.content.Bitstream;
import org.dspace.core.Context;
import org.dspace.storage.bitstore.SyncBitstreamStorageServiceImpl;
import org.dspace.storage.bitstore.factory.StorageServiceFactory;
import org.dspace.storage.bitstore.service.BitstreamStorageService;

/**
* <p>
Expand Down Expand Up @@ -55,7 +58,7 @@ public final class CheckerCommand {
* Checksum history Data access object
*/
private ChecksumHistoryService checksumHistoryService = null;
private BitstreamStorageService bitstreamStorageService = null;
private SyncBitstreamStorageServiceImpl bitstreamStorageService = null;
private ChecksumResultService checksumResultService = null;

/**
Expand Down Expand Up @@ -86,7 +89,7 @@ public final class CheckerCommand {
public CheckerCommand(Context context) {
checksumService = CheckerServiceFactory.getInstance().getMostRecentChecksumService();
checksumHistoryService = CheckerServiceFactory.getInstance().getChecksumHistoryService();
bitstreamStorageService = StorageServiceFactory.getInstance().getBitstreamStorageService();
bitstreamStorageService = StorageServiceFactory.getInstance().getSyncBitstreamStorageService();
checksumResultService = CheckerServiceFactory.getInstance().getChecksumResultService();
this.context = context;
}
Expand Down Expand Up @@ -245,7 +248,9 @@ protected void processBitstream(MostRecentChecksum info) throws SQLException {
info.setProcessStartDate(new Date());

try {
Map checksumMap = bitstreamStorageService.computeChecksum(context, info.getBitstream());
// 1. DB - Store not match
Bitstream bitstream = info.getBitstream();
Map<String, Object> checksumMap = bitstreamStorageService.computeChecksum(context, bitstream);
if (MapUtils.isNotEmpty(checksumMap)) {
info.setBitstreamFound(true);
if (checksumMap.containsKey("checksum")) {
Expand All @@ -255,10 +260,42 @@ protected void processBitstream(MostRecentChecksum info) throws SQLException {
if (checksumMap.containsKey("checksum_algorithm")) {
info.setChecksumAlgorithm(checksumMap.get("checksum_algorithm").toString());
}

// compare new checksum to previous checksum
info.setChecksumResult(compareChecksums(info.getExpectedChecksum(), info.getCurrentChecksum()));

} else {
info.setCurrentChecksum("");
info.setChecksumResult(getChecksumResultByCode(ChecksumResultCode.BITSTREAM_NOT_FOUND));
info.setToBeProcessed(false);
}

// 2. Store1 - Synchronized store 2 not match
// Check checksum of synchronized store
if (bitstream.getStoreNumber() != SYNCHRONIZED_STORES_NUMBER) {
return;
}
if (Objects.equals(ChecksumResultCode.CHECKSUM_NO_MATCH, info.getChecksumResult().getResultCode())) {
return;
}

Map<String, Object> syncStoreChecksumMap =
bitstreamStorageService.computeChecksumSpecStore(context, bitstream,
bitstreamStorageService.getSynchronizedStoreNumber(bitstream));
if (MapUtils.isNotEmpty(syncStoreChecksumMap)) {
String syncStoreChecksum = "";
if (checksumMap.containsKey("checksum")) {
syncStoreChecksum = syncStoreChecksumMap.get("checksum").toString();
}
// compare new checksum to previous checksum
ChecksumResult checksumResult = compareChecksums(info.getCurrentChecksum(), syncStoreChecksum);
// Do not override result with synchronization info if the checksums are not matching between
// DB and store
if (!Objects.equals(checksumResult.getResultCode(), ChecksumResultCode.CHECKSUM_NO_MATCH)) {
info.setChecksumResult(getChecksumResultByCode(ChecksumResultCode.CHECKSUM_SYNC_NO_MATCH));
}
}

// compare new checksum to previous checksum
info.setChecksumResult(compareChecksums(info.getExpectedChecksum(), info.getCurrentChecksum()));
} catch (IOException e) {
// bitstream located, but file missing from asset store
info.setChecksumResult(getChecksumResultByCode(ChecksumResultCode.BITSTREAM_NOT_FOUND));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,8 @@ public void addHistory(Context context, MostRecentChecksum mostRecentChecksum) t
if (mostRecentChecksum.getBitstream().isDeleted()) {
checksumResult = checksumResultService.findByCode(context, ChecksumResultCode.BITSTREAM_MARKED_DELETED);
} else {
checksumResult = checksumResultService.findByCode(context, ChecksumResultCode.CHECKSUM_MATCH);
checksumResult = checksumResultService.findByCode(context,
mostRecentChecksum.getChecksumResult().getResultCode());
}

checksumHistory.setResult(checksumResult);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -24,5 +24,6 @@ public enum ChecksumResultCode {
CHECKSUM_MATCH,
CHECKSUM_NO_MATCH,
CHECKSUM_PREV_NOT_FOUND,
CHECKSUM_ALGORITHM_INVALID
CHECKSUM_ALGORITHM_INVALID,
CHECKSUM_SYNC_NO_MATCH
}
Original file line number Diff line number Diff line change
Expand Up @@ -152,6 +152,7 @@ public int getBitstreamNotFoundReport(Context context, Date startDate, Date endD

osw.write("\n");
osw.write(msg("bitstream-not-found-report"));
osw.write(" ");
osw.write(applyDateFormatShort(startDate));
osw.write(" ");
osw.write(msg("date-range-to"));
Expand Down Expand Up @@ -230,6 +231,7 @@ public int getUncheckedBitstreamsReport(Context context, OutputStreamWriter osw)

osw.write("\n");
osw.write(msg("unchecked-bitstream-report"));
osw.write(" ");
osw.write(applyDateFormatShort(new Date()));
osw.write("\n\n\n");

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -92,8 +92,8 @@ public List<MostRecentChecksum> findByResultTypeInDateRange(Context context, Dat
criteriaQuery.where(criteriaBuilder.and(
criteriaBuilder.equal(mostRecentResult.get(ChecksumResult_.resultCode), resultCode),
criteriaBuilder.lessThanOrEqualTo(
mostRecentChecksumRoot.get(MostRecentChecksum_.processStartDate), startDate),
criteriaBuilder.greaterThan(mostRecentChecksumRoot.get(MostRecentChecksum_.processStartDate), endDate)
mostRecentChecksumRoot.get(MostRecentChecksum_.processStartDate), endDate),
criteriaBuilder.greaterThan(mostRecentChecksumRoot.get(MostRecentChecksum_.processStartDate), startDate)
)
);
List<Order> orderList = new LinkedList<>();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -338,8 +338,8 @@ public void updateLastModified(Context context, Bitstream bitstream) {
}

@Override
public List<Bitstream> findDeletedBitstreams(Context context) throws SQLException {
return bitstreamDAO.findDeletedBitstreams(context);
public List<Bitstream> findDeletedBitstreams(Context context, int limit, int offset) throws SQLException {
return bitstreamDAO.findDeletedBitstreams(context, limit, offset);
}

@Override
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@

import java.io.IOException;
import java.sql.SQLException;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;

Expand All @@ -25,7 +25,7 @@
import org.dspace.core.Constants;
import org.dspace.core.Context;
import org.dspace.event.Event;
import org.dspace.storage.bitstore.DSBitStoreService;
import org.dspace.storage.bitstore.SyncBitstreamStorageServiceImpl;
import org.dspace.storage.bitstore.service.BitstreamStorageService;
import org.springframework.beans.factory.annotation.Autowired;

Expand All @@ -48,7 +48,7 @@ public class ClarinBitstreamServiceImpl implements ClarinBitstreamService {
private static final String CSA = "MD5";

@Autowired
private DSBitStoreService storeService;
private SyncBitstreamStorageServiceImpl syncBitstreamStorageService;
@Autowired
protected BitstreamDAO bitstreamDAO;
@Autowired
Expand Down Expand Up @@ -99,13 +99,12 @@ public boolean validation(Context context, Bitstream bitstream)
}
//get file from assetstore based on internal_id
//recalculate check fields
Map wantedMetadata = new HashMap();
wantedMetadata.put("size_bytes", null);
wantedMetadata.put("checksum", null);
wantedMetadata.put("checksum_algorithm", null);
Map checksumMap = storeService.about(bitstream, wantedMetadata);
List<String> wantedMetadata = List.of("size_bytes", "checksum", "checksum_algorithm");
Map<String, Object> receivedMetadata = syncBitstreamStorageService
.getStore(syncBitstreamStorageService.whichStoreNumber(bitstream))
.about(bitstream, wantedMetadata);
//check that new calculated values match the expected values
if (MapUtils.isEmpty(checksumMap) || !valid(bitstream, checksumMap)) {
if (MapUtils.isEmpty(receivedMetadata) || !valid(bitstream, receivedMetadata)) {
//an error occurred - expected and calculated values do not match
//delete all created data
bitstreamService.delete(context, bitstream);
Expand All @@ -126,7 +125,7 @@ public boolean validation(Context context, Bitstream bitstream)
* @param checksumMap calculated values
* @return bitstream values match with expected values
*/
private boolean valid(Bitstream bitstream, Map checksumMap) {
private boolean valid(Bitstream bitstream, Map<String, Object> checksumMap) {
if (!checksumMap.containsKey("checksum") || !checksumMap.containsKey("checksum_algorithm") ||
!checksumMap.containsKey("size_bytes")) {
log.error("Cannot validate of bitstream with id: " + bitstream.getID() +
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@ public interface BitstreamDAO extends DSpaceObjectLegacySupportDAO<Bitstream> {

public Iterator<Bitstream> findAll(Context context, int limit, int offset) throws SQLException;

public List<Bitstream> findDeletedBitstreams(Context context) throws SQLException;
public List<Bitstream> findDeletedBitstreams(Context context, int limit, int offset) throws SQLException;

public List<Bitstream> findDuplicateInternalIdentifier(Context context, Bitstream bitstream) throws SQLException;

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -41,13 +41,14 @@ protected BitstreamDAOImpl() {
}

@Override
public List<Bitstream> findDeletedBitstreams(Context context) throws SQLException {
public List<Bitstream> findDeletedBitstreams(Context context, int limit, int offset) throws SQLException {
CriteriaBuilder criteriaBuilder = getCriteriaBuilder(context);
CriteriaQuery criteriaQuery = getCriteriaQuery(criteriaBuilder, Bitstream.class);
Root<Bitstream> bitstreamRoot = criteriaQuery.from(Bitstream.class);
criteriaQuery.select(bitstreamRoot);
criteriaQuery.orderBy(criteriaBuilder.desc(bitstreamRoot.get(Bitstream_.ID)));
criteriaQuery.where(criteriaBuilder.equal(bitstreamRoot.get(Bitstream_.deleted), true));
return list(context, criteriaQuery, false, Bitstream.class, -1, -1);
return list(context, criteriaQuery, false, Bitstream.class, limit, offset);

}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -183,7 +183,7 @@ public InputStream retrieve(Context context, Bitstream bitstream)
* @return a list of all bitstreams that have been "deleted"
* @throws SQLException if database error
*/
public List<Bitstream> findDeletedBitstreams(Context context) throws SQLException;
public List<Bitstream> findDeletedBitstreams(Context context, int limit, int offset) throws SQLException;


/**
Expand Down
6 changes: 4 additions & 2 deletions dspace-api/src/main/java/org/dspace/core/Email.java
Original file line number Diff line number Diff line change
Expand Up @@ -314,6 +314,8 @@ public void send() throws MessagingException, IOException {
message.addRecipient(Message.RecipientType.TO, new InternetAddress(
i.next()));
}
// Get headers defined by the template.
String[] templateHeaders = config.getArrayProperty("mail.message.headers");

// Format the mail message body
VelocityEngine templateEngine = new VelocityEngine();
Expand All @@ -334,6 +336,7 @@ public void send() throws MessagingException, IOException {
repo.putStringResource(contentName, content);
// Turn content into a template.
template = templateEngine.getTemplate(contentName);
templateHeaders = new String[] {};
}

StringWriter writer = new StringWriter();
Expand All @@ -351,8 +354,7 @@ public void send() throws MessagingException, IOException {
message.setSentDate(date);
message.setFrom(new InternetAddress(from));

// Get headers defined by the template.
for (String headerName : config.getArrayProperty("mail.message.headers")) {
for (String headerName : templateHeaders) {
String headerValue = (String) vctx.get(headerName);
if ("subject".equalsIgnoreCase(headerName)) {
if (null != headerValue) {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@
import java.security.DigestInputStream;
import java.security.MessageDigest;
import java.security.NoSuchAlgorithmException;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

import org.apache.commons.lang3.StringUtils;
Expand Down Expand Up @@ -153,22 +155,24 @@ protected boolean isLonger(String internalId, int endIndex) {
* Retrieves a map of useful metadata about the File (size, checksum, modified)
*
* @param file The File to analyze
* @param attrs The map where we are storing values
* @param attrs The list of requested metadata values
* @return Map of updated metadatas / attrs
* @throws IOException
*/
public Map about(File file, Map attrs) throws IOException {
public Map<String, Object> about(File file, List<String> attrs) throws IOException {

Map<String, Object> metadata = new HashMap<String, Object>();

try {
if (file != null && file.exists()) {
this.putValueIfExistsKey(attrs, SIZE_BYTES, file.length());
if (attrs.containsKey(CHECKSUM)) {
attrs.put(CHECKSUM, Utils.toHex(this.generateChecksumFrom(file)));
attrs.put(CHECKSUM_ALGORITHM, CSA);
this.putValueIfExistsKey(attrs, metadata, SIZE_BYTES, file.length());
if (attrs.contains(CHECKSUM)) {
metadata.put(CHECKSUM, Utils.toHex(this.generateChecksumFrom(file)));
metadata.put(CHECKSUM_ALGORITHM, CSA);
}
this.putValueIfExistsKey(attrs, MODIFIED, String.valueOf(file.lastModified()));
return attrs;
this.putValueIfExistsKey(attrs, metadata, MODIFIED, String.valueOf(file.lastModified()));
}
return null;
return metadata;
} catch (Exception e) {
log.error("about( FilePath: " + file.getAbsolutePath() + ", Map: " + attrs.toString() + ")", e);
throw new IOException(e);
Expand Down Expand Up @@ -204,13 +208,9 @@ private byte[] generateChecksumFrom(FileInputStream fis) throws IOException, NoS
}
}

protected void putValueIfExistsKey(Map attrs, String key, Object value) {
this.putEntryIfExistsKey(attrs, key, Map.entry(key, value));
}

protected void putEntryIfExistsKey(Map attrs, String key, Map.Entry entry) {
if (attrs.containsKey(key)) {
attrs.put(entry.getKey(), entry.getValue());
protected void putValueIfExistsKey(List<String> attrs, Map<String, Object> metadata, String key, Object value) {
if (attrs.contains(key)) {
metadata.put(key, value);
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@

import java.io.IOException;
import java.io.InputStream;
import java.util.List;
import java.util.Map;

import org.dspace.content.Bitstream;
Expand Down Expand Up @@ -62,13 +63,13 @@ public interface BitStoreService {
* Obtain technical metadata about an asset in the asset store.
*
* @param bitstream The bitstream to describe
* @param attrs A Map whose keys consist of desired metadata fields
* @param attrs A List of desired metadata fields
* @return attrs
* A Map with key/value pairs of desired metadata
* If file not found, then return null
* @throws java.io.IOException If a problem occurs while obtaining metadata
*/
public Map about(Bitstream bitstream, Map attrs) throws IOException;
public Map<String, Object> about(Bitstream bitstream, List<String> attrs) throws IOException;

/**
* Remove an asset from the asset store.
Expand Down
Loading

0 comments on commit bf4f503

Please sign in to comment.