Cleanup

Expensify · iwiznia · Dec 10, 2024 · Dec 10, 2024 · Dec 10, 2024 · Dec 10, 2024
commit 5d29d00945558a0bff37842bc6cceb217d5f2b77
diff --git a/sqlitecluster/SQLiteNode.cpp b/sqlitecluster/SQLiteNode.cpp
@@ -187,10 +187,6 @@ void SQLiteNode::_replicate(SQLitePeer* peer, SData command, size_t sqlitePoolIn
     }
     _replicateStartCV.notify_all();
 
-    if (_replicationThreadsShouldExit) {
-        SINFO("Late replicate start, just exiting.");
-    }
-
     // Initialize each new thread with a new number.
     SInitialize("replicate" + to_string(currentReplicateThreadID.fetch_add(1)));
 
@@ -199,19 +195,6 @@ void SQLiteNode::_replicate(SQLitePeer* peer, SData command, size_t sqlitePoolIn
 
     // Allow the DB handle to be returned regardless of how this function exits.
     SQLiteScopedHandle dbScope(*_dbPool, sqlitePoolIndex);
-    // In dev, we sometimes crash on the destructor for the above after `detach` which implies to me that the DB Pool could have been deleted before we exited.
-    // This is a different manifestation of what could be the same issue.
-    // IN dev, we get:
-    // 2024-12-09T21:12:45.030255+00:00 expensidev2004 bedrock10013: xxxxxx (SQLiteNode.cpp:215) _replicate [replicate2065] [info] {cluster_node_4/SEARCHING} [performance] BEGIN_TRANSACTION replicate thread for commit 15003 waiting on DB count 15001 (ASYNC)
-    // 2024-12-09T21:12:49.023666+00:00 expensidev2004 bedrock10013: xxxxxx (SQLiteNode.cpp:1913) _changeState [sync] [info] {cluster_node_4/FOLLOWING} Replication threads should exit, canceling commits after current leader commit 0
-    // 2024-12-09T21:12:49.030957+00:00 expensidev2004 bedrock10013: xxxxxx (SQLiteSequentialNotifier.cpp:57) waitFor [replicate2065] [info] Hit 1s timeout while global cancel 1 or  specific cancel 0
-    // 2024-12-09T21:12:49.030963+00:00 expensidev2004 bedrock10013: xxxxxx (SQLiteNode.cpp:225) _replicate [replicate2065] [info] {cluster_node_4/FOLLOWING} _localCommitNotifier.waitFor canceled early, returning.
-    // 2024-12-09T21:12:49.030977+00:00 expensidev2004 bedrock10013: xxxxxx (SSignal.cpp:193) _SSignal_StackTrace [replicate2065] [warn] Signal Segmentation fault(11) caused crash, logging stack trace.
-    // Why doesn't this get cancelled as well?
-    // I'm not sure why dev thinks leader has commit 0.
-    // 2024-12-09T21:12:49.023613+00:00 expensidev2004 bedrock10013: xxxxxx (SQLiteNode.cpp:1902) _changeState [sync] [info] {cluster_node_4/FOLLOWING} [NOTIFY] setting commit count to: 17001
-    // 2024-12-09T21:12:49.023674+00:00 expensidev2004 bedrock10013: xxxxxx (SQLiteSequentialNotifier.cpp:105) cancel [sync] [info] Canceling all pending transactions after 0
-
     SQLite& db = dbScope.db();
 
     bool goSearchingOnExit = false;
@@ -231,22 +214,12 @@ void SQLiteNode::_replicate(SQLitePeer* peer, SData command, size_t sqlitePoolIn
             uint64_t waitForCount = SStartsWith(command["ID"], "ASYNC") ? command.calcU64("dbCountAtStart") : currentCount;
             SINFO("[performance] BEGIN_TRANSACTION replicate thread for commit " << newCount << " waiting on DB count " << waitForCount << " (" << (quorum ? "QUORUM" : "ASYNC") << ")");
             while (true) {
-                // Ok, why doesn't this get counted?
-                // It's waiting on: commit 26056807239 waiting on DB count 26056807238
-                // It seems like this should return immediately.
-                // Ok, is it possible we got past here and waited somewhere else? We were either stuck here, or...
                 SQLiteSequentialNotifier::RESULT result = _localCommitNotifier.waitFor(waitForCount, false);
-                // My current inclination is that maybe we reset the commit notifier before the thread really starts.
-
-                // I think we get stuck in `waitFor` because it's been reset to 0 and we're waiting for every commit from the
-                // Beginning of time. I'm not acutally sure why it ever returns, though.
-
                 if (result == SQLiteSequentialNotifier::RESULT::UNKNOWN) {
                     // This should be impossible.
                     SERROR("Got UNKNOWN result from waitFor, which shouldn't happen");
                 } else if (result == SQLiteSequentialNotifier::RESULT::COMPLETED) {
                     // Success case.
-                    // If we didn't get stuck above, we must have hit here, because otherwise we would have logged.
                     break;
                 } else if (result == SQLiteSequentialNotifier::RESULT::CANCELED) {
                     SINFO("_localCommitNotifier.waitFor canceled early, returning.");
@@ -264,7 +237,6 @@ void SQLiteNode::_replicate(SQLitePeer* peer, SData command, size_t sqlitePoolIn
                     if (commitAttemptCount > 1) {
                         SINFO("Commit attempt number " << commitAttemptCount << " for concurrent replication.");
                     }
-                    // We never log this line, so we can't have gotten to here.
                     SINFO("[performance] BEGIN for commit " << newCount);
                     bool uniqueContraintsError = false;
                     try {
@@ -1679,19 +1651,11 @@ void SQLiteNode::_onMESSAGE(SQLitePeer* peer, const SData& message) {
             if (_replicationThreadsShouldExit) {
                 SINFO("Discarding replication message, stopping FOLLOWING");
             } else {
-                // Ok, so the race condition could be here, right?
-                // Right this instance, the thread count is 0, so we can move past the check that
-                // Waits for it to be 0.
-                // But then this thread starts. Can that happen?
-                // So the sync thread does the state switch.
-                // Who is running this though, it should also be the sync thread?
-                // I don't see how anyone else would be able to do this.
                 auto threadID = _replicationThreadCount.fetch_add(1);
                 SDEBUG("Spawning concurrent replicate thread (blocks until DB handle available): " << threadID);
                 try {
                     uint64_t threadAttemptStartTimestamp = STimeNow();
                     _replicateThreadStarted = false;
-                    // Either here.
                     thread(&SQLiteNode::_replicate, this, peer, message, _dbPool->getIndex(false), threadAttemptStartTimestamp).detach();
                     {
                         unique_lock<mutex> lock(_replicateStartMutex);
@@ -1708,6 +1672,7 @@ void SQLiteNode::_onMESSAGE(SQLitePeer* peer, const SData& message) {
                     // and waiting for the transaction that failed will be stuck in an infinite loop. To prevent that
                     // we're changing the state to SEARCHING and sending the cancelAfter property to drop all threads
                     // that depend on the transaction that failed to be threaded.
+                    _replicationThreadCount.fetch_sub(1);
                     SWARN("Caught system_error starting _replicate thread with " << _replicationThreadCount.load() << " threads. e.what()=" << e.what());
                     _changeState(SQLiteNodeState::SEARCHING, message.calcU64("NewCount") - 1);
                     STHROW("Error starting replicate thread so giving up and reconnecting.");
@@ -1960,23 +1925,21 @@ void SQLiteNode::_changeState(SQLiteNodeState newState, uint64_t commitIDToCance
             // Polling wait for threads to quit. This could use a notification model such as with a condition_variable,
             // which would probably be "better" but introduces yet more state variables for a state that we're rarely
             // in, and so I've left it out for the time being.
+            size_t infoCount = 1;
             while (_replicationThreadCount) {
-                SINFO("Waiting for " << _replicationThreadCount << " remaining replication threads.");
+                if (infoCount % 100 == 0) {
+                    SINFO("Waiting for " << _replicationThreadCount << " remaining replication threads.");
+                }
                 usleep(10'000);
+                infoCount++;
             }
-            // How can the above fail????
-            // We only increment _replicationThreadCount in the sync thread and we are reading it here in the sync thread.
-            // It is feasible to call `_changeState` from another thread but that's not what's happening in the issue we're seeing.
 
             // Done exiting. Reset so that we can resume FOLLOWING in the future.
             _replicationThreadsShouldExit = false;
 
             // Guaranteed to be done right now.
-            // I bet this is wrong when these get reset. If we no threads, these get reset before they check?
-            // That doesn't make sense to me for when we should increment _replicationThreadCount
             _localCommitNotifier.reset();
             _leaderCommitNotifier.reset();
-            // If the above completed, we should immediately see `Switching from... ` logged.
 
             // We have no leader anymore.
             _leadPeer = nullptr;

diff --git a/sqlitecluster/SQLiteNode.h b/sqlitecluster/SQLiteNode.h
@@ -198,8 +198,7 @@ class SQLiteNode : public STCPManager {
       public:
         ScopedDecrement(CounterType& counter) : _counter(counter) {}
         ~ScopedDecrement() {
-            auto decrementedTo = --_counter;
-            SINFO("TYLER _counter decremented to : " << decrementedTo);
+            --_counter;
         }
       private:
         CounterType& _counter;

diff --git a/sqlitecluster/SQLiteSequentialNotifier.cpp b/sqlitecluster/SQLiteSequentialNotifier.cpp
@@ -31,8 +31,6 @@ SQLiteSequentialNotifier::RESULT SQLiteSequentialNotifier::waitFor(uint64_t valu
                 SINFO("Canceled after " << _cancelAfter << ", but waiting for " << value << " so not returning yet.");
             } else {
                 // Canceled and we're not before the cancellation cutoff.
-                // I don't see how we don't return here. Maybe we never acquire `waitingThreadMutex`?
-                SINFO("Returning canceled because _cancelAfter=" << _cancelAfter << " and value=" << value);
                 return RESULT::CANCELED;
             }
         } else if (_globalResult != RESULT::UNKNOWN) {
@@ -54,10 +52,9 @@ SQLiteSequentialNotifier::RESULT SQLiteSequentialNotifier::waitFor(uint64_t valu
             // We should investigate any instances of thew below logline to see if they're same as for the success cases mentioned above (i.e., the timeout happens simultaneously as the
             // cancellation) or if the log line is delayed by up to a second (indicating a problem).
             if (_globalResult == RESULT::CANCELED || state->result == RESULT::CANCELED) {
-                // I bet removing a 1 second delay means we don't hit this. we get to calling `reset` on this sooner, possibly while there are threads waiting here.
-
                 // It's possible that we hit the timeout here after `cancel()` has set the global value, but before we received the notification.
                 // This isn't a problem, and we can jump back to the top of the loop and check again. If there's some problem, we'll see it there.
+                // Does this still happen?? Might be fixed.
                 SINFO("Hit 1s timeout while global cancel " << (_globalResult == RESULT::CANCELED) << " or " << " specific cancel " << (state->result == RESULT::CANCELED));
                 continue;
             }
@@ -120,10 +117,7 @@ void SQLiteSequentialNotifier::cancel(uint64_t cancelAfter) {
         auto start = _cancelAfter ? valueThreadMap.upper_bound(_cancelAfter) : valueThreadMap.begin();
         if (start == valueThreadMap.end()) {
             // There's nothing to remove.
-            SINFO("[performance] Next value to cancel after " << cancelAfter << " is N/A");
             return;
-        } else {
-            SINFO("[performance] Next value to cancel after " << cancelAfter << " is " << start->first);
         }
 
         // Now iterate across whatever's remaining and mark it canceled.