-
Notifications
You must be signed in to change notification settings - Fork 18
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Split api layer from file reader (#365)
This change updates the SourceRecordIterator so that it no longer requires an S3Client or any S3-specific knowledge. The iterator now also requests more files once the initial set of files has been processed. The only remaining work is to move the construction of an iterator from the S3Object out of the SourceRecordIterator, which will be done in a follow-up PR and will make the iterator completely reusable. --------- Signed-off-by: dependabot[bot] <[email protected]> Signed-off-by: Aindriu Lavelle <[email protected]> Co-authored-by: dependabot[bot] <49699333+dependabot[bot]@users.noreply.github.com>
- Loading branch information
1 parent
d2b8e55
commit 076e424
Showing
10 changed files
with
223 additions
and
190 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
133 changes: 133 additions & 0 deletions
133
...rce-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/AWSV2SourceClient.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,133 @@ | ||
/* | ||
* Copyright 2024 Aiven Oy | ||
* | ||
* Licensed under the Apache License, Version 2.0 (the "License"); | ||
* you may not use this file except in compliance with the License. | ||
* You may obtain a copy of the License at | ||
* | ||
* http://www.apache.org/licenses/LICENSE-2.0 | ||
* | ||
* Unless required by applicable law or agreed to in writing, software | ||
* distributed under the License is distributed on an "AS IS" BASIS, | ||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | ||
* See the License for the specific language governing permissions and | ||
* limitations under the License. | ||
*/ | ||
|
||
package io.aiven.kafka.connect.s3.source.utils; | ||
|
||
import java.util.HashSet; | ||
import java.util.Iterator; | ||
import java.util.Objects; | ||
import java.util.Set; | ||
import java.util.function.Predicate; | ||
import java.util.stream.Stream; | ||
|
||
import io.aiven.kafka.connect.s3.source.config.S3ClientFactory; | ||
import io.aiven.kafka.connect.s3.source.config.S3SourceConfig; | ||
|
||
import com.amazonaws.services.s3.AmazonS3; | ||
import com.amazonaws.services.s3.model.ListObjectsV2Request; | ||
import com.amazonaws.services.s3.model.S3Object; | ||
import com.amazonaws.services.s3.model.S3ObjectSummary; | ||
import org.codehaus.plexus.util.StringUtils; | ||
|
||
/** | ||
* Called AWSV2SourceClient as this source client implements the V2 version of the aws client library. Handles all calls | ||
* and authentication to AWS and returns useable objects to the SourceRecordIterator. | ||
*/ | ||
public class AWSV2SourceClient { | ||
|
||
public static final int PAGE_SIZE_FACTOR = 2; | ||
private final S3SourceConfig s3SourceConfig; | ||
private final AmazonS3 s3Client; | ||
private final String bucketName; | ||
|
||
private Predicate<S3ObjectSummary> filterPredicate = summary -> summary.getSize() > 0; | ||
private final Set<String> failedObjectKeys; | ||
|
||
/** | ||
* @param s3SourceConfig | ||
* configuration for Source connector | ||
* @param failedObjectKeys | ||
* all objectKeys which have already been tried but have been unable to process. | ||
*/ | ||
public AWSV2SourceClient(final S3SourceConfig s3SourceConfig, final Set<String> failedObjectKeys) { | ||
this.s3SourceConfig = s3SourceConfig; | ||
final S3ClientFactory s3ClientFactory = new S3ClientFactory(); | ||
this.s3Client = s3ClientFactory.createAmazonS3Client(s3SourceConfig); | ||
this.bucketName = s3SourceConfig.getAwsS3BucketName(); | ||
this.failedObjectKeys = new HashSet<>(failedObjectKeys); | ||
} | ||
|
||
/** | ||
* Valid for testing | ||
* | ||
* @param s3Client | ||
* amazonS3Client | ||
* @param s3SourceConfig | ||
* configuration for Source connector | ||
* @param failedObjectKeys | ||
* all objectKeys which have already been tried but have been unable to process. | ||
*/ | ||
AWSV2SourceClient(final AmazonS3 s3Client, final S3SourceConfig s3SourceConfig, | ||
final Set<String> failedObjectKeys) { | ||
this.s3SourceConfig = s3SourceConfig; | ||
this.s3Client = s3Client; | ||
this.bucketName = s3SourceConfig.getAwsS3BucketName(); | ||
this.failedObjectKeys = new HashSet<>(failedObjectKeys); | ||
} | ||
|
||
public Iterator<String> getListOfObjectKeys(final String startToken) { | ||
final ListObjectsV2Request request = new ListObjectsV2Request().withBucketName(bucketName) | ||
.withMaxKeys(s3SourceConfig.getS3ConfigFragment().getFetchPageSize() * PAGE_SIZE_FACTOR); | ||
|
||
if (StringUtils.isNotBlank(startToken)) { | ||
request.withStartAfter(startToken); | ||
} | ||
|
||
final Stream<String> s3ObjectKeyStream = Stream | ||
.iterate(s3Client.listObjectsV2(request), Objects::nonNull, response -> { | ||
// This is called every time next() is called on the iterator. | ||
if (response.isTruncated()) { | ||
return s3Client.listObjectsV2(new ListObjectsV2Request().withBucketName(bucketName) | ||
.withMaxKeys(s3SourceConfig.getS3ConfigFragment().getFetchPageSize() * PAGE_SIZE_FACTOR) | ||
.withContinuationToken(response.getNextContinuationToken())); | ||
} else { | ||
return null; | ||
} | ||
|
||
}) | ||
.flatMap(response -> response.getObjectSummaries() | ||
.stream() | ||
.filter(filterPredicate) | ||
.filter(objectSummary -> assignObjectToTask(objectSummary.getKey())) | ||
.filter(objectSummary -> !failedObjectKeys.contains(objectSummary.getKey()))) | ||
.map(S3ObjectSummary::getKey); | ||
return s3ObjectKeyStream.iterator(); | ||
} | ||
|
||
public S3Object getObject(final String objectKey) { | ||
return s3Client.getObject(bucketName, objectKey); | ||
} | ||
|
||
public void addFailedObjectKeys(final String objectKey) { | ||
this.failedObjectKeys.add(objectKey); | ||
} | ||
|
||
public void setFilterPredicate(final Predicate<S3ObjectSummary> predicate) { | ||
filterPredicate = predicate; | ||
} | ||
|
||
private boolean assignObjectToTask(final String objectKey) { | ||
final int maxTasks = Integer.parseInt(s3SourceConfig.originals().get("tasks.max").toString()); | ||
final int taskId = Integer.parseInt(s3SourceConfig.originals().get("task.id").toString()) % maxTasks; | ||
final int taskAssignment = Math.floorMod(objectKey.hashCode(), maxTasks); | ||
return taskAssignment == taskId; | ||
} | ||
|
||
public void shutdown() { | ||
s3Client.shutdown(); | ||
} | ||
|
||
} |
80 changes: 0 additions & 80 deletions
80
s3-source-connector/src/main/java/io/aiven/kafka/connect/s3/source/utils/FileReader.java
This file was deleted.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.