Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion core/src/main/scala/kafka/server/KafkaApis.scala
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,7 @@ class KafkaApis(val requestChannel: RequestChannel,
val describeTopicPartitionsRequestHandler = new DescribeTopicPartitionsRequestHandler(
metadataCache, authHelper, config)

val inklessTopicMetadataTransformer = inklessSharedState.map(s => new InklessTopicMetadataTransformer(s.metadata()))
val inklessTopicMetadataTransformer = inklessSharedState.map(s => new InklessTopicMetadataTransformer(time, s.metadata()))

def close(): Unit = {
aclApis.close()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
import org.apache.kafka.common.message.DescribeTopicPartitionsResponseData;
import org.apache.kafka.common.message.MetadataResponseData;
import org.apache.kafka.common.network.ListenerName;
import org.apache.kafka.common.utils.Time;
import org.apache.kafka.metadata.LeaderAndIsr;

import java.util.Collections;
Expand All @@ -33,11 +34,13 @@
import io.aiven.inkless.control_plane.MetadataView;

public class InklessTopicMetadataTransformer {
private final Time time;
private final MetadataView metadataView;

private final AtomicInteger roundRobinCounter = new AtomicInteger();

public InklessTopicMetadataTransformer(final MetadataView metadataView) {
public InklessTopicMetadataTransformer(final Time time, final MetadataView metadataView) {
this.time = Objects.requireNonNull(time, "time cannot be null");
this.metadataView = Objects.requireNonNull(metadataView, "metadataView cannot be null");
}

Expand All @@ -52,10 +55,12 @@ public void transformClusterMetadata(
Objects.requireNonNull(topicMetadata, "topicMetadata cannot be null");

final int leaderForInklessPartitions = selectLeaderForInklessPartitions(listenerName, clientId);
boolean hasInklessTopics = false;
for (final var topic : topicMetadata) {
if (!metadataView.isInklessTopic(topic.name())) {
continue;
}
hasInklessTopics = true;
for (final var partition : topic.partitions()) {
partition.setLeaderId(leaderForInklessPartitions);
final List<Integer> list = List.of(leaderForInklessPartitions);
Expand All @@ -65,6 +70,11 @@ public void transformClusterMetadata(
partition.setLeaderEpoch(LeaderAndIsr.INITIAL_LEADER_EPOCH);
}
}
if (hasInklessTopics) {
// Introduce artificial latency to avoid a race condition between the metadata update and the producer
// causing OutOfOrderSequenceException.
time.sleep(500);
Comment on lines +74 to +76
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not sure it's a good idea for several reasons:

  1. I don't see how waiting for fixed delay of 500 ms may solve the problem... As produce and metadata updates happen asynchronously, they may interleave in arbitrary order with or without the sleep.
  2. Blocking the handler thread isn't great.

I think the problem is theoretically possible on classic Kafka, too; it's just unlikely because of greater leadership stability + the NOT_LEADER_OR_FOLLOWER error. How bad would it be if we just do nothing and rely on the existing recovery mechanism?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

As produce and metadata updates happen asynchronously, they may interleave in arbitrary order with or without the sleep.

From what I found, metadata updates are not async (e.g. https://github.com/apache/kafka/blob/3c1f965c60789dcc8ee14ebabcbb4e16ebffc5ee/clients/src/main/java/org/apache/kafka/clients/NetworkClient.java#L642), hence 500ms (chosen arbitrarily, but aiming to cope with a rotation on the broker side, default 250ms) would give room to avoid the race condition; though I'd agree any other implementation could potentially handle this update asynchronously, and this solution may not be enough.

I think the problem is theoretically possible on classic Kafka, too

I don't think it is with partition leaders. The producer state is cached on the leader and if leadership changes, it starts accepting requests after updating the state.

How bad would be it if we just do nothing and rely on existing recovery mechanism?

OutOfOrderSequence errors are retried: https://github.com/apache/kafka/blob/6f783f85362071f82da3dcef706c7e6b89b86c2a/clients/src/main/java/org/apache/kafka/clients/producer/internals/Sender.java#L829-L834
We could just document that this error is expected and retried, and users could safely ignore it on their logs.
Another side effect we have observed is that retries lead to latency spikes (as the retry is handled within the producer machinery, so from the request's perspective it just takes longer) -- though again, this could be documented.

}
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,8 @@
import org.apache.kafka.common.message.MetadataResponseData.MetadataResponseTopic;
import org.apache.kafka.common.network.ListenerName;
import org.apache.kafka.common.security.auth.SecurityProtocol;
import org.apache.kafka.common.utils.MockTime;
import org.apache.kafka.common.utils.Time;

import org.junit.jupiter.api.BeforeEach;
import org.junit.jupiter.api.Nested;
Expand Down Expand Up @@ -62,16 +64,20 @@ class InklessTopicMetadataTransformerTest {
static final Uuid TOPIC_CLASSIC_ID = new Uuid(456, 456);
static final ListenerName LISTENER_NAME = ListenerName.forSecurityProtocol(SecurityProtocol.PLAINTEXT);

Time time = new MockTime();
@Mock
MetadataView metadataView;

@Test
void nulls() {
assertThatThrownBy(() -> new InklessTopicMetadataTransformer(null))
assertThatThrownBy(() -> new InklessTopicMetadataTransformer(null, metadataView))
.isInstanceOf(NullPointerException.class)
.hasMessage("time cannot be null");
assertThatThrownBy(() -> new InklessTopicMetadataTransformer(time, null))
.isInstanceOf(NullPointerException.class)
.hasMessage("metadataView cannot be null");

final var transformer = new InklessTopicMetadataTransformer(metadataView);
final var transformer = new InklessTopicMetadataTransformer(time, metadataView);
assertThatThrownBy(() -> transformer.transformClusterMetadata(LISTENER_NAME, "x", null))
.isInstanceOf(NullPointerException.class)
.hasMessage("topicMetadata cannot be null");
Expand All @@ -94,7 +100,7 @@ void setup() {
@NullSource
@ValueSource(strings = {"inkless_az=az1", "x=y", ""})
void clusterMetadata(final String clientId) {
final var transformer = new InklessTopicMetadataTransformer(metadataView);
final var transformer = new InklessTopicMetadataTransformer(time, metadataView);

final List<MetadataResponseTopic> topicMetadata = List.of();
transformer.transformClusterMetadata(LISTENER_NAME, clientId, topicMetadata);
Expand All @@ -105,7 +111,7 @@ void clusterMetadata(final String clientId) {
@NullSource
@ValueSource(strings = {"inkless_az=az1", "x=y", ""})
void describeTopicResponse(final String clientId) {
final var transformer = new InklessTopicMetadataTransformer(metadataView);
final var transformer = new InklessTopicMetadataTransformer(time, metadataView);

final DescribeTopicPartitionsResponseData describeResponse = new DescribeTopicPartitionsResponseData();
transformer.transformDescribeTopicResponse(LISTENER_NAME, clientId, describeResponse);
Expand Down Expand Up @@ -186,7 +192,7 @@ void clusterMetadata(final String clientAZ, final int expectedLeaderId1, final i
inklessTopicMetadata.get(),
classicTopicMetadata.get()
);
final var transformer = new InklessTopicMetadataTransformer(metadataView);
final var transformer = new InklessTopicMetadataTransformer(time, metadataView);

transformer.transformClusterMetadata(LISTENER_NAME, "inkless_az=" + clientAZ, topicMetadata);

Expand Down Expand Up @@ -279,7 +285,7 @@ void describeTopicResponse(final String clientAZ, final int expectedLeaderId1, f
inklessTopicMetadata.get(),
classicTopicMetadata.get()
).iterator()));
final var transformer = new InklessTopicMetadataTransformer(metadataView);
final var transformer = new InklessTopicMetadataTransformer(time, metadataView);

transformer.transformDescribeTopicResponse(LISTENER_NAME, "inkless_az=" + clientAZ, describeResponse);

Expand Down Expand Up @@ -333,7 +339,7 @@ void clusterMetadata() {
));

final List<MetadataResponseTopic> topicMetadata = List.of(inklessTopicMetadata.get());
final var transformer = new InklessTopicMetadataTransformer(metadataView);
final var transformer = new InklessTopicMetadataTransformer(time, metadataView);

transformer.transformClusterMetadata(LISTENER_NAME, "inkless_az=az0", topicMetadata);
final var expectedInklessTopicMetadata = inklessTopicMetadata.get();
Expand Down Expand Up @@ -368,7 +374,7 @@ void describeTopicResponse() {
))
).iterator()));

final var transformer = new InklessTopicMetadataTransformer(metadataView);
final var transformer = new InklessTopicMetadataTransformer(time, metadataView);

final DescribeTopicPartitionsResponseData describeResponse = describeResponseSupplier.get();
transformer.transformDescribeTopicResponse(LISTENER_NAME, "inkless_az=az0", describeResponse);
Expand Down Expand Up @@ -413,7 +419,7 @@ void clusterMetadata() {
));

final List<MetadataResponseTopic> topicMetadata = List.of(inklessTopicMetadata.get());
final var transformer = new InklessTopicMetadataTransformer(metadataView);
final var transformer = new InklessTopicMetadataTransformer(time, metadataView);

transformer.transformClusterMetadata(LISTENER_NAME, null, topicMetadata);
final var expectedInklessTopicMetadata = inklessTopicMetadata.get();
Expand Down Expand Up @@ -448,7 +454,7 @@ void describeTopicResponse() {
))
).iterator()));

final var transformer = new InklessTopicMetadataTransformer(metadataView);
final var transformer = new InklessTopicMetadataTransformer(time, metadataView);
final DescribeTopicPartitionsResponseData describeResponse = describeResponseSupplier.get();

transformer.transformDescribeTopicResponse(LISTENER_NAME, null, describeResponse);
Expand Down