Skip to content

Commit

Permalink
Avoid accessing a null ClientConnection instance (#317)
Browse files Browse the repository at this point in the history
### Motivation

We observed server null `ClientConnection` accesses in test environment.
See the `this=0x0` outputs in the following two typical stacks.

```
#8  bytesWritten (this=0xb8, size=371) at lib/SharedBuffer.h:166
#9  pulsar::ClientConnection::handleRead (this=0x0, err=..., bytesTransferred=371, minReadSize=4) at lib/ClientConnection.cc:609
```

```
#12 0x00007f33202933d2 in unique_lock (__m=..., this=0x7f3311c82800) at /opt/rh/devtoolset-7/root/usr/include/c++/7/bits/std_mutex.h:197
#13 pulsar::ClientConnection::sendPendingCommands (this=0x0) at lib/ClientConnection.cc:1071
#14 0x00007f3320293d2d in pulsar::ClientConnection::handleSendPair (this=0x0, err=...) at lib/ClientConnection.cc:1066
```

Though `shared_from_this()` is always passed to the `std::bind`
function, when the method of `ClientConnection` is called, the pointer
is still `null`.

### Modifications

First, replace all `std::bind` calls with the lambda expression that
catches `std::weak_ptr<ClientConnection>` and perform null checks
explicitly on the value returned by the `lock()` method.

Since now all asio callbacks don't hold a `shared_ptr`, the owner of
the `ClientConnection` object should be `ConnectionPool`, i.e. the pool
maintains some connections, while all asio callbacks use `weak_ptr` to
test if the connection is present.

Second, make `ClientConnection::getConnection` return `shared_ptr`
rather than `weak_ptr` so that the caller side does not need to check if
`lock()` returns null in the callback of this future.

We cannot make `ConnectionPool::getConnectionAsync` return `shared_ptr`
because it could return the future of `connectPromise_`, which is hold
by `ClientConnection` itself. We should avoid holding a `shared_ptr` of
`ClientConnection` because its owner is `ConnectionPool`.
  • Loading branch information
BewareMyPower authored Sep 20, 2023
1 parent ba5902a commit b35ae1a
Show file tree
Hide file tree
Showing 12 changed files with 319 additions and 170 deletions.
12 changes: 10 additions & 2 deletions lib/BinaryProtoLookupService.cc
Original file line number Diff line number Diff line change
Expand Up @@ -122,8 +122,12 @@ void BinaryProtoLookupService::sendPartitionMetadataLookupRequest(const std::str
promise->setFailed(result);
return;
}
auto conn = clientCnx.lock();
if (!conn) {
promise->setFailed(ResultConnectError);
return;
}
LookupDataResultPromisePtr lookupPromise = std::make_shared<LookupDataResultPromise>();
ClientConnectionPtr conn = clientCnx.lock();
uint64_t requestId = newRequestId();
conn->newPartitionedMetadataLookup(topicName, requestId, lookupPromise);
lookupPromise->getFuture().addListener(std::bind(&BinaryProtoLookupService::handlePartitionMetadataLookup,
Expand Down Expand Up @@ -212,7 +216,11 @@ void BinaryProtoLookupService::sendGetTopicsOfNamespaceRequest(const std::string
return;
}

ClientConnectionPtr conn = clientCnx.lock();
auto conn = clientCnx.lock();
if (!conn) {
promise->setFailed(ResultConnectError);
return;
}
uint64_t requestId = newRequestId();
LOG_DEBUG("sendGetTopicsOfNamespaceRequest. requestId: " << requestId << " nsName: " << nsName);
conn->newGetTopicsOfNamespace(nsName, mode, requestId)
Expand Down
285 changes: 206 additions & 79 deletions lib/ClientConnection.cc

Large diffs are not rendered by default.

11 changes: 9 additions & 2 deletions lib/ClientConnection.h
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ using TcpResolverPtr = std::shared_ptr<boost::asio::ip::tcp::resolver>;
class ExecutorService;
using ExecutorServicePtr = std::shared_ptr<ExecutorService>;

class ConnectionPool;
class ClientConnection;
typedef std::shared_ptr<ClientConnection> ClientConnectionPtr;
typedef std::weak_ptr<ClientConnection> ClientConnectionWeakPtr;
Expand Down Expand Up @@ -127,16 +128,21 @@ class PULSAR_PUBLIC ClientConnection : public std::enable_shared_from_this<Clien
*/
ClientConnection(const std::string& logicalAddress, const std::string& physicalAddress,
ExecutorServicePtr executor, const ClientConfiguration& clientConfiguration,
const AuthenticationPtr& authentication, const std::string& clientVersion);
const AuthenticationPtr& authentication, const std::string& clientVersion,
ConnectionPool& pool);
~ClientConnection();

#if __cplusplus < 201703L
std::weak_ptr<ClientConnection> weak_from_this() noexcept { return shared_from_this(); }
#endif

/*
* starts tcp connect_async
* @return future<ConnectionPtr> which is not yet set
*/
void tcpConnectAsync();

void close(Result result = ResultConnectError);
void close(Result result = ResultConnectError, bool detach = true);

bool isClosed() const;

Expand Down Expand Up @@ -383,6 +389,7 @@ class PULSAR_PUBLIC ClientConnection : public std::enable_shared_from_this<Clien
bool isTlsAllowInsecureConnection_ = false;

const std::string clientVersion_;
ConnectionPool& pool_;
friend class PulsarFriend;

void closeSocket();
Expand Down
11 changes: 8 additions & 3 deletions lib/ClientImpl.cc
Original file line number Diff line number Diff line change
Expand Up @@ -517,8 +517,8 @@ void ClientImpl::handleConsumerCreated(Result result, ConsumerImplBaseWeakPtr co
}
}

Future<Result, ClientConnectionWeakPtr> ClientImpl::getConnection(const std::string& topic) {
Promise<Result, ClientConnectionWeakPtr> promise;
Future<Result, ClientConnectionPtr> ClientImpl::getConnection(const std::string& topic) {
Promise<Result, ClientConnectionPtr> promise;

const auto topicNamePtr = TopicName::get(topic);
if (!topicNamePtr) {
Expand All @@ -537,7 +537,12 @@ Future<Result, ClientConnectionWeakPtr> ClientImpl::getConnection(const std::str
pool_.getConnectionAsync(data.logicalAddress, data.physicalAddress)
.addListener([promise](Result result, const ClientConnectionWeakPtr& weakCnx) {
if (result == ResultOk) {
promise.setValue(weakCnx);
auto cnx = weakCnx.lock();
if (cnx) {
promise.setValue(cnx);
} else {
promise.setFailed(ResultConnectError);
}
} else {
promise.setFailed(result);
}
Expand Down
4 changes: 2 additions & 2 deletions lib/ClientImpl.h
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ class ConsumerImplBase;
typedef std::weak_ptr<ConsumerImplBase> ConsumerImplBaseWeakPtr;

class ClientConnection;
using ClientConnectionWeakPtr = std::weak_ptr<ClientConnection>;
using ClientConnectionPtr = std::shared_ptr<ClientConnection>;

class LookupService;
using LookupServicePtr = std::shared_ptr<LookupService>;
Expand Down Expand Up @@ -96,7 +96,7 @@ class ClientImpl : public std::enable_shared_from_this<ClientImpl> {

void getPartitionsForTopicAsync(const std::string& topic, GetPartitionsCallback callback);

Future<Result, ClientConnectionWeakPtr> getConnection(const std::string& topic);
Future<Result, ClientConnectionPtr> getConnection(const std::string& topic);

void closeAsync(CloseCallback callback);
void shutdown();
Expand Down
32 changes: 21 additions & 11 deletions lib/ConnectionPool.cc
Original file line number Diff line number Diff line change
Expand Up @@ -48,12 +48,13 @@ bool ConnectionPool::close() {
return false;
}

std::unique_lock<std::mutex> lock(mutex_);
std::unique_lock<std::recursive_mutex> lock(mutex_);
if (poolConnections_) {
for (auto cnxIt = pool_.begin(); cnxIt != pool_.end(); cnxIt++) {
ClientConnectionPtr cnx = cnxIt->second.lock();
auto& cnx = cnxIt->second;
if (cnx) {
cnx->close(ResultDisconnected);
// The 2nd argument is false because removing a value during the iteration will cause segfault
cnx->close(ResultDisconnected, false);
}
}
pool_.clear();
Expand All @@ -69,22 +70,22 @@ Future<Result, ClientConnectionWeakPtr> ConnectionPool::getConnectionAsync(
return promise.getFuture();
}

std::unique_lock<std::mutex> lock(mutex_);
std::unique_lock<std::recursive_mutex> lock(mutex_);

if (poolConnections_) {
PoolMap::iterator cnxIt = pool_.find(logicalAddress);
if (cnxIt != pool_.end()) {
ClientConnectionPtr cnx = cnxIt->second.lock();
auto& cnx = cnxIt->second;

if (cnx && !cnx->isClosed()) {
if (!cnx->isClosed()) {
// Found a valid or pending connection in the pool
LOG_DEBUG("Got connection from pool for " << logicalAddress << " use_count: " //
<< (cnx.use_count() - 1) << " @ " << cnx.get());
<< (cnx.use_count()) << " @ " << cnx.get());
return cnx->getConnectFuture();
} else {
// Deleting stale connection
LOG_INFO("Deleting stale connection from pool for "
<< logicalAddress << " use_count: " << (cnx.use_count() - 1) << " @ " << cnx.get());
// The closed connection should have been removed from the pool in ClientConnection::close
LOG_WARN("Deleting stale connection from pool for "
<< logicalAddress << " use_count: " << (cnx.use_count()) << " @ " << cnx.get());
pool_.erase(logicalAddress);
}
}
Expand All @@ -94,7 +95,7 @@ Future<Result, ClientConnectionWeakPtr> ConnectionPool::getConnectionAsync(
ClientConnectionPtr cnx;
try {
cnx.reset(new ClientConnection(logicalAddress, physicalAddress, executorProvider_->get(),
clientConfiguration_, authentication_, clientVersion_));
clientConfiguration_, authentication_, clientVersion_, *this));
} catch (const std::runtime_error& e) {
lock.unlock();
LOG_ERROR("Failed to create connection: " << e.what())
Expand All @@ -114,4 +115,13 @@ Future<Result, ClientConnectionWeakPtr> ConnectionPool::getConnectionAsync(
return future;
}

void ConnectionPool::remove(const std::string& key, ClientConnection* value) {
std::lock_guard<std::recursive_mutex> lock(mutex_);
auto it = pool_.find(key);
if (it->second.get() == value) {
LOG_INFO("Remove connection for " << key);
pool_.erase(it);
}
}

} // namespace pulsar
6 changes: 4 additions & 2 deletions lib/ConnectionPool.h
Original file line number Diff line number Diff line change
Expand Up @@ -51,6 +51,8 @@ class PULSAR_PUBLIC ConnectionPool {
*/
bool close();

void remove(const std::string& key, ClientConnection* value);

/**
* Get a connection from the pool.
* <p>
Expand Down Expand Up @@ -78,11 +80,11 @@ class PULSAR_PUBLIC ConnectionPool {
ClientConfiguration clientConfiguration_;
ExecutorServiceProviderPtr executorProvider_;
AuthenticationPtr authentication_;
typedef std::map<std::string, ClientConnectionWeakPtr> PoolMap;
typedef std::map<std::string, std::shared_ptr<ClientConnection>> PoolMap;
PoolMap pool_;
bool poolConnections_;
const std::string clientVersion_;
mutable std::mutex mutex_;
mutable std::recursive_mutex mutex_;
std::atomic_bool closed_{false};

friend class PulsarFriend;
Expand Down
6 changes: 3 additions & 3 deletions lib/ConsumerImpl.cc
Original file line number Diff line number Diff line change
Expand Up @@ -315,13 +315,13 @@ void ConsumerImpl::handleCreateConsumer(const ClientConnectionPtr& cnx, Result r
if (consumerCreatedPromise_.isComplete()) {
// Consumer had already been initially created, we need to retry connecting in any case
LOG_WARN(getName() << "Failed to reconnect consumer: " << strResult(result));
scheduleReconnection(get_shared_this_ptr());
scheduleReconnection();
} else {
// Consumer was not yet created, retry to connect to broker if it's possible
result = convertToTimeoutIfNecessary(result, creationTimestamp_);
if (result == ResultRetryable) {
LOG_WARN(getName() << "Temporary error in creating consumer: " << strResult(result));
scheduleReconnection(get_shared_this_ptr());
scheduleReconnection();
} else {
LOG_ERROR(getName() << "Failed to create consumer: " << strResult(result));
consumerCreatedPromise_.setFailed(result);
Expand Down Expand Up @@ -1206,7 +1206,7 @@ void ConsumerImpl::negativeAcknowledge(const MessageId& messageId) {
void ConsumerImpl::disconnectConsumer() {
LOG_INFO("Broker notification of Closed consumer: " << consumerId_);
resetCnx();
scheduleReconnection(get_shared_this_ptr());
scheduleReconnection();
}

void ConsumerImpl::closeAsync(ResultCallback originalCallback) {
Expand Down
102 changes: 48 additions & 54 deletions lib/HandlerBase.cc
Original file line number Diff line number Diff line change
Expand Up @@ -79,98 +79,92 @@ void HandlerBase::grabCnx() {

LOG_INFO(getName() << "Getting connection from pool");
ClientImplPtr client = client_.lock();
Future<Result, ClientConnectionWeakPtr> future = client->getConnection(*topic_);
future.addListener(std::bind(&HandlerBase::handleNewConnection, std::placeholders::_1,
std::placeholders::_2, get_weak_from_this()));
}

void HandlerBase::handleNewConnection(Result result, ClientConnectionWeakPtr connection,
HandlerBaseWeakPtr weakHandler) {
HandlerBasePtr handler = weakHandler.lock();
if (!handler) {
LOG_DEBUG("HandlerBase Weak reference is not valid anymore");
if (!client) {
LOG_WARN(getName() << "Client is invalid when calling grabCnx()");
connectionFailed(ResultConnectError);
return;
}

handler->reconnectionPending_ = false;

if (result == ResultOk) {
ClientConnectionPtr conn = connection.lock();
if (conn) {
LOG_DEBUG(handler->getName() << "Connected to broker: " << conn->cnxString());
handler->connectionOpened(conn);
return;
}
// TODO - look deeper into why the connection is null while the result is ResultOk
LOG_INFO(handler->getName() << "ClientConnectionPtr is no longer valid");
}
handler->connectionFailed(result);
scheduleReconnection(handler);
auto weakSelf = get_weak_from_this();
client->getConnection(*topic_).addListener(
[this, weakSelf](Result result, const ClientConnectionPtr& cnx) {
auto self = weakSelf.lock();
if (!self) {
LOG_DEBUG("HandlerBase Weak reference is not valid anymore");
return;
}

reconnectionPending_ = false;

if (result == ResultOk) {
LOG_DEBUG(getName() << "Connected to broker: " << cnx->cnxString());
connectionOpened(cnx);
} else {
connectionFailed(result);
scheduleReconnection();
}
});
}

void HandlerBase::handleDisconnection(Result result, ClientConnectionWeakPtr connection,
HandlerBaseWeakPtr weakHandler) {
HandlerBasePtr handler = weakHandler.lock();
if (!handler) {
LOG_DEBUG("HandlerBase Weak reference is not valid anymore");
return;
}
void HandlerBase::handleDisconnection(Result result, const ClientConnectionPtr& cnx) {
State state = state_;

State state = handler->state_;

ClientConnectionPtr currentConnection = handler->getCnx().lock();
if (currentConnection && connection.lock().get() != currentConnection.get()) {
LOG_WARN(handler->getName()
<< "Ignoring connection closed since we are already attached to a newer connection");
ClientConnectionPtr currentConnection = getCnx().lock();
if (currentConnection && cnx.get() != currentConnection.get()) {
LOG_WARN(
getName() << "Ignoring connection closed since we are already attached to a newer connection");
return;
}

handler->resetCnx();
resetCnx();

if (result == ResultRetryable) {
scheduleReconnection(handler);
scheduleReconnection();
return;
}

switch (state) {
case Pending:
case Ready:
scheduleReconnection(handler);
scheduleReconnection();
break;

case NotStarted:
case Closing:
case Closed:
case Producer_Fenced:
case Failed:
LOG_DEBUG(handler->getName()
<< "Ignoring connection closed event since the handler is not used anymore");
LOG_DEBUG(getName() << "Ignoring connection closed event since the handler is not used anymore");
break;
}
}

void HandlerBase::scheduleReconnection(HandlerBasePtr handler) {
const auto state = handler->state_.load();
void HandlerBase::scheduleReconnection() {
const auto state = state_.load();

if (state == Pending || state == Ready) {
TimeDuration delay = handler->backoff_.next();
TimeDuration delay = backoff_.next();

LOG_INFO(handler->getName() << "Schedule reconnection in " << (delay.total_milliseconds() / 1000.0)
<< " s");
handler->timer_->expires_from_now(delay);
LOG_INFO(getName() << "Schedule reconnection in " << (delay.total_milliseconds() / 1000.0) << " s");
timer_->expires_from_now(delay);
// passing shared_ptr here since time_ will get destroyed, so tasks will be cancelled
// so we will not run into the case where grabCnx is invoked on out of scope handler
handler->timer_->async_wait(std::bind(&HandlerBase::handleTimeout, std::placeholders::_1, handler));
auto weakSelf = get_weak_from_this();
timer_->async_wait([weakSelf](const boost::system::error_code& ec) {
auto self = weakSelf.lock();
if (self) {
self->handleTimeout(ec);
}
});
}
}

void HandlerBase::handleTimeout(const boost::system::error_code& ec, HandlerBasePtr handler) {
void HandlerBase::handleTimeout(const boost::system::error_code& ec) {
if (ec) {
LOG_DEBUG(handler->getName() << "Ignoring timer cancelled event, code[" << ec << "]");
LOG_DEBUG(getName() << "Ignoring timer cancelled event, code[" << ec << "]");
return;
} else {
handler->epoch_++;
handler->grabCnx();
epoch_++;
grabCnx();
}
}

Expand Down
7 changes: 3 additions & 4 deletions lib/HandlerBase.h
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@ class HandlerBase {
/*
* Schedule reconnection after backoff time
*/
static void scheduleReconnection(HandlerBasePtr handler);
void scheduleReconnection();

/**
* Do some cleanup work before changing `connection_` to `cnx`.
Expand All @@ -89,10 +89,9 @@ class HandlerBase {
virtual const std::string& getName() const = 0;

private:
static void handleNewConnection(Result result, ClientConnectionWeakPtr connection, HandlerBaseWeakPtr wp);
static void handleDisconnection(Result result, ClientConnectionWeakPtr connection, HandlerBaseWeakPtr wp);
void handleDisconnection(Result result, const ClientConnectionPtr& cnx);

static void handleTimeout(const boost::system::error_code& ec, HandlerBasePtr handler);
void handleTimeout(const boost::system::error_code& ec);

protected:
ClientImplWeakPtr client_;
Expand Down
Loading

0 comments on commit b35ae1a

Please sign in to comment.