Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Collection of fixes for various shutdown scenario race conditions #266

Merged
merged 19 commits into from
Mar 10, 2020
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
19 commits
Select commit Hold shift + click to select a range
6d8e717
Collection of fixes for various shutdown scenario race conditions:
maxgolov Feb 3, 2020
fe20129
Fix FlushAndTeardown causing double decrement on PAL ref-count.
maxgolov Feb 6, 2020
d8e4a89
Merge branch 'master' into maxgolov/shutdown_fixes
maxgolov Feb 6, 2020
f3ba622
Merge Mobile Platforms support from branch 'master' of https://github…
maxgolov Mar 5, 2020
af12ea6
Merge branch 'maxgolov/shutdown_fixes' of https://github.com/Microsof…
maxgolov Mar 5, 2020
1d569fd
Addressing code review comments from Jason:
maxgolov Mar 5, 2020
c226aab
Update WorkerThread.cpp
maxgolov Mar 5, 2020
d0639f1
Update TransmissionPolicyManager.cpp
maxgolov Mar 5, 2020
be59a73
Rename m_backoff_lock to m_backoffMutex
maxgolov Mar 5, 2020
315a21f
Merge branch 'maxgolov/shutdown_fixes' of https://github.com/Microsof…
maxgolov Mar 5, 2020
49e5ee5
Merge branch 'master' into maxgolov/shutdown_fixes
maxgolov Mar 5, 2020
772b4ae
Merge branch 'master' into maxgolov/shutdown_fixes
maxgolov Mar 5, 2020
d1f9d00
Merge branch 'master' into maxgolov/shutdown_fixes
maxgolov Mar 5, 2020
91367b0
Update ITaskDispatcher.hpp
maxgolov Mar 5, 2020
38b22f1
Merge branch 'master' into maxgolov/shutdown_fixes
maxgolov Mar 5, 2020
2934cd2
Addressing code review concerns about a few possible race conditions
maxgolov Mar 9, 2020
bc16fb6
Address code review concern
maxgolov Mar 9, 2020
5f7013d
Update lib/http/HttpClient_WinInet.cpp
maxgolov Mar 10, 2020
75646f3
Update lib/http/HttpClient_WinInet.cpp
maxgolov Mar 10, 2020
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions lib/http/HttpClientManager.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,7 @@ namespace ARIASDK_NS_BEGIN {
PAL::scheduleTask(&m_taskDispatcher, 0, this, &HttpClientManager::onHttpResponse, callback);
}

/* This method may get executed synchronously on Windows from handleSendRequest in case of connection failure */
void HttpClientManager::onHttpResponse(HttpCallback* callback)
{
EventsUploadContextPtr &ctx = callback->m_ctx;
Expand Down
2 changes: 1 addition & 1 deletion lib/http/HttpClientManager.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ class HttpClientManager
ILogManager& m_logManager;
IHttpClient& m_httpClient;
ITaskDispatcher& m_taskDispatcher;
std::mutex m_httpCallbacksMtx;
std::recursive_mutex m_httpCallbacksMtx;
std::list<HttpCallback*> m_httpCallbacks;
};

Expand Down
74 changes: 60 additions & 14 deletions lib/http/HttpClient_WinInet.cpp
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
// clang-format off
// Copyright (c) Microsoft. All rights reserved.
#include "mat/config.h"

Expand All @@ -11,6 +12,7 @@

#include <algorithm>
#include <memory>
#include <mutex>
#include <sstream>
#include <vector>
#include <oacr.h>
Expand All @@ -22,17 +24,21 @@ class WinInetRequestWrapper
protected:
HttpClient_WinInet& m_parent;
std::string m_id;
IHttpResponseCallback* m_appCallback {};
HINTERNET m_hWinInetSession {};
HINTERNET m_hWinInetRequest {};
IHttpResponseCallback* m_appCallback{ nullptr };
HINTERNET m_hWinInetSession { nullptr };
HINTERNET m_hWinInetRequest { nullptr };
SimpleHttpRequest* m_request;
BYTE m_buffer[1024] {};
std::recursive_mutex m_connectingMutex;
bool isAborted { false };
bool isCallbackCalled { false };

public:
WinInetRequestWrapper(HttpClient_WinInet& parent, SimpleHttpRequest* request)
: m_parent(parent),
m_request(request),
m_id(request->GetId())
m_id(request->GetId()),
m_hWinInetRequest(nullptr),
m_request(request)
{
LOG_TRACE("%p WinInetRequestWrapper()", this);
}
Expand All @@ -55,10 +61,11 @@ class WinInetRequestWrapper
*/
void cancel()
{
LOCKGUARD(m_connectingMutex);
if (m_hWinInetRequest != nullptr)
{
isAborted = true;
::InternetCloseHandle(m_hWinInetRequest);
// don't wait for request callback
}
}

Expand Down Expand Up @@ -119,6 +126,17 @@ class WinInetRequestWrapper
m_parent.m_requests[m_id] = this;
}

m_connectingMutex.lock();
maxgolov marked this conversation as resolved.
Show resolved Hide resolved
if (isAborted)
{
// Request aborted before being created
m_connectingMutex.unlock();
onRequestComplete(ERROR_INTERNET_OPERATION_CANCELLED);
// Note that at this point the object just self-destroyed.
return;
}
m_connectingMutex.unlock();

URL_COMPONENTSA urlc;
memset(&urlc, 0, sizeof(urlc));
urlc.dwStructSize = sizeof(urlc);
Expand All @@ -131,7 +149,7 @@ class WinInetRequestWrapper
if (!::InternetCrackUrlA(m_request->m_url.data(), (DWORD)m_request->m_url.size(), 0, &urlc)) {
DWORD dwError = ::GetLastError();
LOG_WARN("InternetCrackUrl() failed: dwError=%d url=%s", dwError, m_request->m_url.data());
onRequestComplete(dwError);
onRequestComplete(ERROR_INTERNET_OPERATION_CANCELLED);
return;
}

Expand All @@ -140,10 +158,11 @@ class WinInetRequestWrapper
if (m_hWinInetSession == NULL) {
DWORD dwError = ::GetLastError();
LOG_WARN("InternetConnect() failed: %d", dwError);
onRequestComplete(dwError);
onRequestComplete(ERROR_INTERNET_OPERATION_CANCELLED);
return;
}
// TODO: Session handle for the same target should be cached across requests to enable keep-alive.
DispatchEvent(OnConnecting);

PCSTR szAcceptTypes[] = {"*/*", NULL};
m_hWinInetRequest = ::HttpOpenRequestA(
Expand All @@ -155,9 +174,10 @@ class WinInetRequestWrapper
if (m_hWinInetRequest == NULL) {
DWORD dwError = ::GetLastError();
LOG_WARN("HttpOpenRequest() failed: %d", dwError);
onRequestComplete(dwError);
onRequestComplete(ERROR_INTERNET_OPERATION_CANCELLED);
return;
}
DispatchEvent(OnCreated);

/* Perform optional MS Root certificate check for certain end-point URLs */
if (m_parent.IsMsRootCheckRequired())
Expand All @@ -175,7 +195,13 @@ class WinInetRequestWrapper
for (auto const& header : m_request->m_headers) {
os << header.first << ": " << header.second << "\r\n";
}
::HttpAddRequestHeadersA(m_hWinInetRequest, os.str().data(), static_cast<DWORD>(os.tellp()), HTTP_ADDREQ_FLAG_ADD | HTTP_ADDREQ_FLAG_REPLACE);
if (!::HttpAddRequestHeadersA(m_hWinInetRequest, os.str().data(), static_cast<DWORD>(os.tellp()), HTTP_ADDREQ_FLAG_ADD | HTTP_ADDREQ_FLAG_REPLACE))
{
DWORD dwError = ::GetLastError();
LOG_WARN("HttpAddRequestHeadersA() failed: %d", dwError);
onRequestComplete(ERROR_INTERNET_OPERATION_CANCELLED);
return;
}

void *data = static_cast<void *>(m_request->m_body.data());
DWORD size = static_cast<DWORD>(m_request->m_body.size());
Expand All @@ -185,7 +211,8 @@ class WinInetRequestWrapper
if (bResult == TRUE && dwError != ERROR_IO_PENDING) {
dwError = ::GetLastError();
LOG_WARN("HttpSendRequest() failed: %d", dwError);
onRequestComplete(dwError);
onRequestComplete(ERROR_INTERNET_OPERATION_CANCELLED);
return;
}
}

Expand All @@ -207,6 +234,9 @@ class WinInetRequestWrapper
return;
}

case INTERNET_STATUS_HANDLE_CLOSING:
break;
maxgolov marked this conversation as resolved.
Show resolved Hide resolved

case INTERNET_STATUS_REQUEST_COMPLETE: {
assert(dwStatusInformationLength >= sizeof(INTERNET_ASYNC_RESULT));
INTERNET_ASYNC_RESULT& result = *static_cast<INTERNET_ASYNC_RESULT*>(lpvStatusInformation);
Expand All @@ -222,6 +252,14 @@ class WinInetRequestWrapper
}
}

void DispatchEvent(HttpStateEvent type)
{
if (m_appCallback != nullptr)
{
m_appCallback->OnHttpStateEvent(type, static_cast<void*>(m_hWinInetRequest), 0);
}
}

void onRequestComplete(DWORD dwError)
{
std::unique_ptr<SimpleHttpResponse> response(new SimpleHttpResponse(m_id));
Expand Down Expand Up @@ -348,9 +386,16 @@ class WinInetRequestWrapper
}
}

// 'response' gets released in EventsUploadContext.clear()
m_appCallback->OnHttpResponse(response.release());
m_parent.erase(m_id);
assert(isCallbackCalled==false);
maxgolov marked this conversation as resolved.
Show resolved Hide resolved
if (!isCallbackCalled)
{
// 'response' gets released in EventsUploadContext.clear()
::InternetSetStatusCallback(m_hWinInetRequest, nullptr);
isCallbackCalled = true;
m_appCallback->OnHttpResponse(response.release());
// parent is destroying the request object by id
m_parent.erase(m_id);
}
}
};

Expand Down Expand Up @@ -454,3 +499,4 @@ bool HttpClient_WinInet::IsMsRootCheckRequired()
} ARIASDK_NS_END

#endif // HAVE_MAT_DEFAULT_HTTP_CLIENT
// clang-format on
9 changes: 7 additions & 2 deletions lib/include/public/ITaskDispatcher.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,11 @@ namespace ARIASDK_NS_BEGIN
/// <summary>
/// A Done item is an item that has been marked by the worker thread as already completed.
/// </summary>
Done
Done,
/// <summary>
/// A Cancelled item is an item that has been marked by the worker thread as Cancelled.
/// </summary>
Cancelled
} Type;

/// <summary>
Expand Down Expand Up @@ -92,8 +96,9 @@ namespace ARIASDK_NS_BEGIN
/// Cancel a previously queued tasks
/// </summary>
/// <param name="task">Task to be cancelled</param>
/// <param name="waitTime">Amount of time to wait for if the task is currently executing</param>
/// <returns>True if successfully cancelled, else false</returns>
virtual bool Cancel(Task* task) = 0;
virtual bool Cancel(Task* task, uint64_t waitTime = 0) = 0;
};

/// @endcond
Expand Down
1 change: 1 addition & 0 deletions lib/include/public/LogManagerBase.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -304,6 +304,7 @@ namespace ARIASDK_NS_BEGIN
/// </summary>
static status_t UploadNow()
{
LM_LOCKGUARD(stateLock());
maxgolov marked this conversation as resolved.
Show resolved Hide resolved
if (isHost())
LM_SAFE_CALL(GetLogController()->UploadNow);
return STATUS_EPERM; // Permission denied
Expand Down
8 changes: 4 additions & 4 deletions lib/include/public/Version.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,8 @@
#define MAT_VERSION_HPP
// WARNING: DO NOT MODIFY THIS FILE!
// This file has been automatically generated, manual changes will be lost.
#define BUILD_VERSION_STR "3.3.8.1"
#define BUILD_VERSION 3,3,8,1
#define BUILD_VERSION_STR "3.3.29.1"
#define BUILD_VERSION 3,3,29,1

#ifndef RESOURCE_COMPILER_INVOKED
#include <stdint.h>
Expand All @@ -30,8 +30,8 @@ namespace ARIASDK_NS_BEGIN {
uint64_t const Version =
((uint64_t)3 << 48) |
((uint64_t)3 << 32) |
((uint64_t)8 << 16) |
((uint64_t)1);
((uint64_t)29 << 16) |
((uint64_t)1);

} ARIASDK_NS_END

Expand Down
2 changes: 1 addition & 1 deletion lib/modules
4 changes: 2 additions & 2 deletions lib/pal/TaskDispatcher.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -68,11 +68,11 @@ namespace PAL_NS_BEGIN {
m_task(h.m_task),
m_taskDispatcher(h.m_taskDispatcher) { };

bool Cancel()
bool Cancel(uint64_t waitTime = 0)
{
if (m_task)
{
bool result = (m_taskDispatcher != nullptr) && (m_taskDispatcher->Cancel(m_task));
bool result = (m_taskDispatcher != nullptr) && (m_taskDispatcher->Cancel(m_task, waitTime));
m_task = nullptr;
m_taskDispatcher = nullptr;
return result;
Expand Down
3 changes: 2 additions & 1 deletion lib/pal/TaskDispatcher_CAPI.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -125,7 +125,8 @@ namespace PAL_NS_BEGIN {
m_queueFn(&capiTask, &OnAsyncTaskCallback);
}

bool TaskDispatcher_CAPI::Cancel(Task* task)
// TODO: currently shutdown wait on task cancellation is not implemented for C API Task Dispatcher
bool TaskDispatcher_CAPI::Cancel(Task* task, uint64_t)
{
std::string taskId;

Expand Down
2 changes: 1 addition & 1 deletion lib/pal/TaskDispatcher_CAPI.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ namespace PAL_NS_BEGIN {
TaskDispatcher_CAPI(task_dispatcher_queue_fn_t queueFn, task_dispatcher_cancel_fn_t cancelFn, task_dispatcher_join_fn_t joinFn);
void Join() override;
void Queue(MAT::Task* task) override;
bool Cancel(MAT::Task* task) override;
bool Cancel(MAT::Task* task, uint64_t waitTime = 0) override;

private:
task_dispatcher_queue_fn_t m_queueFn;
Expand Down
23 changes: 21 additions & 2 deletions lib/pal/WorkerThread.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,9 @@
/* Maximum scheduler interval for SDK is 1 hour required for clamping in case of monotonic clock drift */
#define MAX_FUTURE_DELTA_MS (60 * 60 * 1000)

/* Polling interval for task cancellation */
#define TASK_CANCEL_WAIT_MS 100
kindbe marked this conversation as resolved.
Show resolved Hide resolved
maxgolov marked this conversation as resolved.
Show resolved Hide resolved

namespace PAL_NS_BEGIN {

class WorkerThreadShutdownItem : public Task
Expand Down Expand Up @@ -88,13 +91,29 @@ namespace PAL_NS_BEGIN {
m_event.post();
}

bool Cancel(MAT::Task* item) override
bool Cancel(MAT::Task* item, uint64_t waitTime) override
{
if ((m_itemInProgress == item)||(item==nullptr))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

m_itemInProgress [](start = 17, length = 16)

Shouldn't m_itemInProgress be protected by a mutex, since it gets set on the worker thread, but gets checked in Cancel?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

My understanding of the logic, is that it should be something like:

  • Acquire lock and check if task is in progress
  • If task is in progress, release lock and wait for task completion. Re-acquire lock to check the task progress
  • If task isn't in progress, will need to maintain the lock while clearing it from the queue

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'll write a separate note on why it's fine :) I was again thinking that you may be right, as it looks racy, but in fact it's not. I'd like to avoid unnecessarily locking here since it might impact perf within the worker thread queue. Deletion of items is done atomically.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think you're right --- there was a corner case where we did not wait if item is already popped for execution, but m_itemInProgress hasn't been assigned yet. I made this part of code thread-safe in the next commit.

if (item == nullptr)
{
return false;
}

if (m_itemInProgress == item)
{
/* Can't recursively wait on completion of our own thread */
if (m_hThread.get_id() != std::this_thread::get_id())
{
LOCKGUARD(m_lock);
Copy link
Contributor

@bliptec bliptec Mar 7, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is this lockguard here? It makes me nervous to hold a lock while sleeping. Won't this also prevent the workerthread from getting the shutdown item off the queue until this completes (probably not a big deal, but a little scary in case that code changes)

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

At first I thought that you're correct. But on a second thought, in fact, I think it's probably a good thing. Because it totally eliminates my other concern about another task assigned with the same raw ptr:

Task is already executing in another thread here:

    self->m_itemInProgress = item.get();
    (*item)();
    self->m_itemInProgress = nullptr;

That section of code does not hold the lock. When we're done, we assigned nullptr to the item in progress. Cool. That unblocks the waiter.

Since our idle-waiter holds the lock - we prevent the worker thread to schedule a new item, because in order to schedule another item - it has to acquire the lock. This will cause up to 50ms hiccup to the scheduler, but since cancellation itself is a very infrequent operation and the worker is in a background thread - it won't affect performance in any way.

Since I hope we never cancel the shutdown item :D :D :D ... we should be fine. Yeah, it'd prevent the shutdown item to get scheduled if we're waiting on a cancellation of a current task preceding the shutdown item.

Let me know if it makes sense. If it doesn't, please let me know if I'm mistaken. I'm honestly willing to adjust this if this doesn't seem right to you.

Copy link
Contributor Author

@maxgolov maxgolov Mar 9, 2020

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I reworked this slightly, but the comment remains the same - I kept the lock for the duration of idle-wait, to prevent another item from being scheduled. That is fully addressing my previous reliability concern - what if a new task gets allocated with the same thread ptr. Now we won't run into that condition at all. This may result in a hiccup of up to 50ms for the worker thread scheduling, but only in case when we're currently executing work item that's being cancelled. In other cases we do not block. My logical reasoning here:

  • cancellation is not a frequent operation
  • cancellation of a currently executing item is even less frequent operation.
  • background worker tasks get generally scheduled at 1s intervals (min granularity interval for the upload scheduler), and we are not providing any realtime nor latency guarantees in there. Even the UploadNow API itself is merely a hint, not a guarantee.

Thus, this hypothetical 50ms 'excessive' wait on task cancellation is not impacting perf in any measurable way.

while ((waitTime > TASK_CANCEL_WAIT_MS) && (m_itemInProgress == item))
{
PAL::sleep(TASK_CANCEL_WAIT_MS);
waitTime -= TASK_CANCEL_WAIT_MS;
}
/* Either waited long enough or the task is still executing */
}
return (m_itemInProgress == item);
maxgolov marked this conversation as resolved.
Show resolved Hide resolved
}

{
LOCKGUARD(m_lock);
auto it = std::find(m_timerQueue.begin(), m_timerQueue.end(), item);
Expand Down
2 changes: 1 addition & 1 deletion lib/pal/desktop/desktop.vcxitems.filters
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
<ClCompile Include="$(MSBuildThisFileDirectory)NetworkDetector.cpp" />
<ClCompile Include="$(MSBuildThisFileDirectory)..\..\http\HttpClient_WinInet.cpp" />
</ItemGroup>
<ItemGroup Condition="exists('$(MSBuildThisFileDirectory)..\..\modules\')">
<ItemGroup Condition="exists('$(MSBuildThisFileDirectory)..\..\modules\utc\')">
<ClCompile Include="$(MSBuildThisFileDirectory)..\..\modules\utc\desktop\UtcHelpers.cpp" />
<ClCompile Include="$(MSBuildThisFileDirectory)..\..\modules\utc\UtcTelemetrySystem.cpp" />
</ItemGroup>
Expand Down
2 changes: 1 addition & 1 deletion lib/pal/universal/universal.vcxitems.filters
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
<ClInclude Include="$(MSBuildThisFileDirectory)..\..\http\HttpClient_WinRt.hpp" />
<ClInclude Include="$(MSBuildThisFileDirectory)pch.h" />
</ItemGroup>
<ItemGroup Condition="exists('$(MSBuildThisFileDirectory)..\..\modules\')">
<ItemGroup Condition="exists('$(MSBuildThisFileDirectory)..\..\modules\utc\')">
<ClCompile Include="$(MSBuildThisFileDirectory)..\..\modules\utc\universal\UtcHelpers.cpp" />
<ClCompile Include="$(MSBuildThisFileDirectory)..\..\modules\utc\UtcTelemetrySystem.cpp" />
</ItemGroup>
Expand Down
Loading