Initiate S3 Multi-part upload on receiving first event #318

Open · wants to merge 2 commits into base: main
@@ -25,9 +25,11 @@
import java.time.ZonedDateTime;
import java.time.format.DateTimeFormatter;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Objects;
import java.util.stream.Collectors;

import org.apache.kafka.clients.consumer.OffsetAndMetadata;
import org.apache.kafka.common.TopicPartition;
@@ -40,6 +42,7 @@
import io.aiven.kafka.connect.common.grouper.RecordGrouper;
import io.aiven.kafka.connect.common.grouper.RecordGrouperFactory;
import io.aiven.kafka.connect.common.output.OutputWriter;
import io.aiven.kafka.connect.common.templating.Template;
import io.aiven.kafka.connect.common.templating.VariableTemplatePart;
import io.aiven.kafka.connect.s3.config.AwsCredentialProviderFactory;
import io.aiven.kafka.connect.s3.config.S3SinkConfig;
@@ -53,17 +56,21 @@
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

- @SuppressWarnings("PMD.ExcessiveImports")
+ @SuppressWarnings({ "PMD.ExcessiveImports", "PMD.TooManyMethods" })
public final class S3SinkTask extends SinkTask {

- private static final Logger LOGGER = LoggerFactory.getLogger(AivenKafkaConnectS3SinkConnector.class);
+ private static final Logger LOGGER = LoggerFactory.getLogger(S3SinkTask.class);

private RecordGrouper recordGrouper;

private S3SinkConfig config;

private AmazonS3 s3Client;

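// Open OutputWriters keyed by the resolved file name; each wraps an in-flight S3 multi part upload
// and stays open across put() calls until flush() or stop() closes it.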
private Map<String, OutputWriter> writers;

private boolean isKeyRecordGrouper;

AwsCredentialProviderFactory credentialFactory = new AwsCredentialProviderFactory();

@SuppressWarnings("PMD.UnnecessaryConstructor") // required by Connect
@@ -76,6 +83,8 @@ public void start(final Map<String, String> props) {
Objects.requireNonNull(props, "props hasn't been set");
config = new S3SinkConfig(props);
s3Client = createAmazonS3Client(config);
writers = new HashMap<>();
isKeyRecordGrouper = isOfTypeKeyRecordGrouper(config.getFilenameTemplate());
try {
recordGrouper = RecordGrouperFactory.newRecordGrouper(config);
} catch (final Exception e) { // NOPMD AvoidCatchingGenericException
@@ -86,6 +95,20 @@ public void start(final Map<String, String> props) {
}
}

/**
* This determines whether the file is key based, in which case a single file may be changed multiple times per
* flush, or whether it is a roll over file which is reset at each flush.

Contributor comment on lines +99 to +100:

Can you explain more about this? What is key based grouping, and why does it mutate the file?

*
* @param fileNameTemplate
* the filename template supplied in the configuration
* @return true if the grouper type is RecordGrouperFactory.KEY_RECORD or RecordGrouperFactory.KEY_TOPIC_PARTITION_RECORD
*/
private boolean isOfTypeKeyRecordGrouper(final Template fileNameTemplate) {
return RecordGrouperFactory.KEY_RECORD.equals(RecordGrouperFactory.resolveRecordGrouperType(fileNameTemplate))
|| RecordGrouperFactory.KEY_TOPIC_PARTITION_RECORD
.equals(RecordGrouperFactory.resolveRecordGrouperType(fileNameTemplate));
}
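// Illustrative note (assumed template values, not taken from this change): a file.name.template such as
// "{{key}}" typically resolves to RecordGrouperFactory.KEY_RECORD, and one such as
// "{{topic}}/{{partition}}/{{key}}" to RecordGrouperFactory.KEY_TOPIC_PARTITION_RECORD; with either, the
// S3 object for a key is rewritten in place on each flush rather than rolled over to a new object.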

private AmazonS3 createAmazonS3Client(final S3SinkConfig config) {
final var awsEndpointConfig = newEndpointConfiguration(this.config);
final var clientConfig = PredefinedClientConfigurations.defaultConfig()
@@ -110,39 +133,138 @@ public void put(final Collection<SinkRecord> records) {
Objects.requireNonNull(records, "records cannot be null");
LOGGER.info("Processing {} records", records.size());
records.forEach(recordGrouper::put);
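// When the grouper is not key based, each grouped batch is written to its S3 multi part upload as soon as
// it arrives; key based groups are only written when flush() is called.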
if (!isKeyRecordGrouper) {
recordGrouper.records().forEach((filename, groupedRecords) -> writeToS3(filename, groupedRecords, records));
}
}

/**
* Flush is used alongside the key record groupers to initiate and complete file writes to S3. When not using a
* key record grouper, the S3 upload is initiated by the put command and flush is used to finish writing the
* files and roll them over.
*
* @param offsets
* the latest offsets sent to put that are now ready to be flushed.
*/
@Override
public void flush(final Map<TopicPartition, OffsetAndMetadata> offsets) {
- try {
- recordGrouper.records().forEach(this::flushFile);
- } finally {
if (isKeyRecordGrouper) {
try {
recordGrouper.records().forEach(this::flushToS3);
} finally {
recordGrouper.clear();
}
} else {
// On flush, take a snapshot of the active writers.
final Collection<OutputWriter> activeWriters = List.copyOf(writers.values());
// Clear the record grouper so it restarts its offset heads and new writers are created on the next put.
recordGrouper.clear();
// Reset the writers map; the snapshot still holds the writers to be closed.
writers.clear();
activeWriters.forEach(writer -> {
try {
// Closing the writer writes anything left in its buffer and completes the S3 multi part upload.
writer.close();
} catch (IOException e) {
throw new ConnectException(e);
}
});
}

}

- private void flushFile(final String filename, final List<SinkRecord> records) {
- Objects.requireNonNull(records, "records cannot be null");
- if (records.isEmpty()) {
- return;
/**
* getOutputWriter checks whether a compatible OutputWriter already exists for the file and, if not, creates one
* and returns it to the caller.
*
* @param filename
* the name of the file to be written to in S3
* @param sinkRecord
* a SinkRecord used to create a new S3OutputStream
* @return the OutputWriter to use for writing this record to S3
*/
private OutputWriter getOutputWriter(final String filename, final SinkRecord sinkRecord) {
final String fileNameTemplate = getFileNameTemplate(filename, sinkRecord);

if (writers.get(fileNameTemplate) == null) {
final var out = newStreamFor(filename, sinkRecord);
try {
writers.put(fileNameTemplate,
OutputWriter.builder()
.withCompressionType(config.getCompressionType())
.withExternalProperties(config.originalsStrings())
.withOutputFields(config.getOutputFields())
.withEnvelopeEnabled(config.envelopeEnabled())
.build(out, config.getFormatType()));
} catch (IOException e) {
throw new ConnectException(e);
}
}
return writers.get(fileNameTemplate);
}

/**
* Writes the newly received records for a grouping to the multi part upload for the given file, leaving the
* writer open until the next flush.
*
* @param filename
* the name of the file in S3 to be written to
* @param records
* all records in this record grouping, including those already written to S3
* @param recordToBeWritten
* the records received in the current put call; only records present in this collection are written
*/
private void writeToS3(final String filename, final List<SinkRecord> records,
final Collection<SinkRecord> recordToBeWritten) {
final SinkRecord sinkRecord = records.get(0);
- try (var out = newStreamFor(filename, sinkRecord);
- var outputWriter = OutputWriter.builder()
- .withCompressionType(config.getCompressionType())
- .withExternalProperties(config.originalsStrings())
- .withOutputFields(config.getOutputFields())
- .withEnvelopeEnabled(config.envelopeEnabled())
- .build(out, config.getFormatType())) {
- outputWriter.writeRecords(records);
- } catch (final IOException e) {
// This writer is being left open until a flush occurs.
final OutputWriter writer; // NOPMD CloseResource
try {
writer = getOutputWriter(filename, sinkRecord);
// The record grouper returns all records for that filename; only the new batch of records is added to the
// multi part upload.
writer.writeRecords(records.stream().filter(recordToBeWritten::contains).collect(Collectors.toList()));

} catch (IOException e) {
throw new ConnectException(e);
}

}

/**
* For the key record groupers the file is written just once per flush to keep the number of calls to S3 to a
* minimum. Each file contains one record and is written with the latest record when flush is called.
*
* @param filename
* the name of the file in S3 to be written to
* @param records
* all records in this record grouping, including those already written to S3
*/
private void flushToS3(final String filename, final List<SinkRecord> records) {
final SinkRecord sinkRecord = records.get(0);
try (var writer = getOutputWriter(filename, sinkRecord)) {
// For key based files the record grouper returns only one record for that filename.
writer.writeRecords(records);
writers.remove(filename, writer);
} catch (IOException e) {
throw new ConnectException(e);
}

}

@Override
public void stop() {
if (!isKeyRecordGrouper) {
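// Close any writers that are still open, completing their multi part uploads, before shutting down the S3 client.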
writers.forEach((k, v) -> {
try {
v.close();
} catch (IOException e) {
throw new ConnectException(e);
}
});
}
s3Client.shutdown();

LOGGER.info("Stop S3 Sink Task");
}

@@ -152,11 +274,15 @@ public String version() {
}

private OutputStream newStreamFor(final String filename, final SinkRecord record) {
- final var fullKey = config.usesFileNameTemplate() ? filename : oldFullKey(record);
+ final var fullKey = getFileNameTemplate(filename, record);
return new S3OutputStream(config.getAwsS3BucketName(), fullKey, config.getAwsS3PartSize(), s3Client,
config.getServerSideEncryptionAlgorithmName());
}

private String getFileNameTemplate(final String filename, final SinkRecord record) {
return config.usesFileNameTemplate() ? filename : oldFullKey(record);
}

private EndpointConfiguration newEndpointConfiguration(final S3SinkConfig config) {
if (Objects.isNull(config.getAwsS3EndPoint())) {
return null;