Skip to content

Commit e2bbcd2

Browse files
committed
Fix plugin parent_select failover
1 parent e29827f commit e2bbcd2

File tree

2 files changed

+40
-126
lines changed

2 files changed

+40
-126
lines changed

plugins/experimental/parent_select/parent_select.cc

Lines changed: 23 additions & 115 deletions
Original file line numberDiff line numberDiff line change
@@ -47,65 +47,9 @@ struct StrategyTxn {
4747
TSNextHopSelectionStrategy *strategy;
4848
void *txn; // void* because the actual type will depend on the strategy.
4949
int request_count;
50-
const char *prev_host; // the actually tried host, used when send_request sets the response_action to be the next thing to try.
51-
size_t prev_host_len;
52-
in_port_t prev_port;
53-
bool prev_is_retry;
54-
bool prev_no_cache;
50+
TSResponseAction prev_ra;
5551
};
5652

57-
int
58-
handle_send_request(TSHttpTxn txnp, StrategyTxn *strategyTxn)
59-
{
60-
TSDebug(PLUGIN_NAME, "handle_send_request calling");
61-
TSDebug(PLUGIN_NAME, "handle_send_request got strategy '%s'", strategyTxn->strategy->name());
62-
63-
auto strategy = strategyTxn->strategy;
64-
65-
// if (strategyTxn->retry_attempts == 0) {
66-
// // just did a DoRemap, which means we need to set the response_action of what to do in the event of failure
67-
// // because a failure might not call read_response (e.g. dns failure)
68-
// strategyTxn->retry_attempts = 1;
69-
// TSHttpTxnReenable(txnp, TS_EVENT_HTTP_CONTINUE);
70-
// return TS_SUCCESS;
71-
// }
72-
73-
// before sending a req, we need to set what to do on failure.
74-
// Because some failures don't call handle_response before getting to HttpTransact::HandleResponse
75-
// (e.g. connection failures)
76-
77-
TSResponseAction ra;
78-
TSHttpTxnResponseActionGet(txnp, &ra);
79-
80-
TSDebug(PLUGIN_NAME, "handle_send_request setting prev %.*s:%d", int(ra.hostname_len), ra.hostname, ra.port);
81-
strategyTxn->prev_host = ra.hostname;
82-
strategyTxn->prev_host_len = ra.hostname_len;
83-
strategyTxn->prev_port = ra.port;
84-
strategyTxn->prev_is_retry = ra.is_retry;
85-
strategyTxn->prev_no_cache = ra.no_cache;
86-
87-
strategy->next(txnp, strategyTxn->txn, ra.hostname, ra.hostname_len, ra.port, &ra.hostname, &ra.hostname_len, &ra.port,
88-
&ra.is_retry, &ra.no_cache);
89-
90-
ra.nextHopExists = ra.hostname_len != 0;
91-
ra.fail = !ra.nextHopExists; // failed is whether to fail and return to the client. failed=false means to retry the parent we set
92-
// in the response_action
93-
94-
// we don't know if it's retryable yet, because we don't have a status. So set it retryable if we have something which could be
95-
// retried. We'll set it retryable per the status in handle_response, and os_dns (which is called on connection failures, and
96-
// always retryable [notwithstanding num_retries]).
97-
ra.responseIsRetryable = ra.nextHopExists;
98-
ra.goDirect = strategy->goDirect();
99-
ra.parentIsProxy = strategy->parentIsProxy();
100-
101-
TSDebug(PLUGIN_NAME, "handle_send_request setting response_action hostname '%.*s' port %d direct %d proxy %d",
102-
int(ra.hostname_len), ra.hostname, ra.port, ra.goDirect, ra.parentIsProxy);
103-
TSHttpTxnResponseActionSet(txnp, &ra);
104-
105-
TSHttpTxnReenable(txnp, TS_EVENT_HTTP_CONTINUE);
106-
return TS_SUCCESS;
107-
}
108-
10953
// mark parents up or down, on failure or successful retry.
11054
void
11155
mark_response(TSHttpTxn txnp, StrategyTxn *strategyTxn, TSHttpStatus status)
@@ -118,17 +62,14 @@ mark_response(TSHttpTxn txnp, StrategyTxn *strategyTxn, TSHttpStatus status)
11862

11963
TSResponseAction ra;
12064
// if the prev_host isn't null, then that was the actual host we tried which needs to be marked down.
121-
if (strategyTxn->prev_host != nullptr) {
122-
ra.hostname = strategyTxn->prev_host;
123-
ra.hostname_len = strategyTxn->prev_host_len;
124-
ra.port = strategyTxn->prev_port;
125-
ra.is_retry = strategyTxn->prev_is_retry;
126-
ra.no_cache = strategyTxn->prev_no_cache;
65+
if (strategyTxn->prev_ra.hostname_len != 0) {
66+
ra = strategyTxn->prev_ra;
12767
TSDebug(PLUGIN_NAME, "mark_response using prev %.*s:%d", int(ra.hostname_len), ra.hostname, ra.port);
12868
} else {
12969
TSHttpTxnResponseActionGet(txnp, &ra);
13070
TSDebug(PLUGIN_NAME, "mark_response using response_action %.*s:%d", int(ra.hostname_len), ra.hostname, ra.port);
13171
}
72+
13273
if (isFailure && strategy->onFailureMarkParentDown(status)) {
13374
if (ra.hostname == nullptr) {
13475
TSError(
@@ -159,13 +100,6 @@ handle_read_response(TSHttpTxn txnp, StrategyTxn *strategyTxn)
159100

160101
TSDebug(PLUGIN_NAME, "handle_read_response got strategy '%s'", strategy->name());
161102

162-
// increment request count here, not send_request, because we need to consistently increase with os_dns hooks.
163-
// if we incremented the req count in send_request and not here, that would never be called on DNS failures, but DNS successes
164-
// would call os_dns and also send_request, resulting in dns failures incrementing the count by 1, and dns successes but http
165-
// failures would increment by 2. And successes would increment by 2. Hence, the only consistent way to count requests is on
166-
// read_response and os_dns, and not send_request.
167-
++strategyTxn->request_count;
168-
169103
TSMBuffer resp;
170104
TSMLoc resp_hdr;
171105
if (TS_SUCCESS != TSHttpTxnServerRespGet(txnp, &resp, &resp_hdr)) {
@@ -193,18 +127,15 @@ handle_read_response(TSHttpTxn txnp, StrategyTxn *strategyTxn)
193127
// Status.
194128
TSResponseAction ra;
195129
TSHttpTxnResponseActionGet(txnp, &ra);
196-
ra.responseIsRetryable = strategy->responseIsRetryable(strategyTxn->request_count, status);
130+
ra.responseIsRetryable = strategy->responseIsRetryable(strategyTxn->request_count - 1, status);
197131
TSHttpTxnResponseActionSet(txnp, &ra);
198132
}
199133

200134
// un-set the "prev" hackery. That only exists for markdown, which we just did.
201135
// The response_action is now the next thing to try, if this was a failure,
202136
// and should now be considered authoritative for everything.
203-
strategyTxn->prev_host = nullptr;
204-
strategyTxn->prev_host_len = 0;
205-
strategyTxn->prev_port = 0;
206-
strategyTxn->prev_is_retry = false;
207-
strategyTxn->prev_no_cache = false;
137+
138+
memset(&strategyTxn->prev_ra, 0, sizeof(TSResponseAction));
208139

209140
TSHandleMLocRelease(resp, TS_NULL_MLOC, resp_hdr);
210141
TSHttpTxnReenable(txnp, TS_EVENT_HTTP_CONTINUE);
@@ -216,48 +147,37 @@ handle_os_dns(TSHttpTxn txnp, StrategyTxn *strategyTxn)
216147
{
217148
TSDebug(PLUGIN_NAME, "handle_os_dns calling");
218149

219-
++strategyTxn->request_count; // this is called after connection failures. So if we got here, we attempted a request
220-
221-
// This is not called on the first attempt.
222-
// Thus, if we got called here, we know it's because of a parent failure.
223-
// So immediately find the next parent, and set the response_action.
150+
++strategyTxn->request_count;
224151

225152
auto strategy = strategyTxn->strategy;
226153

227154
TSDebug(PLUGIN_NAME, "handle_os_dns got strategy '%s'", strategy->name());
228155

229-
mark_response(txnp, strategyTxn, STATUS_CONNECTION_FAILURE);
230-
231-
// now, we need to figure out, are we the first call after send_response set the response_action as the next-thing-to-try,
232-
// or are we a subsequent call, and need to actually set a new response_action
233-
234-
if (strategyTxn->prev_host != nullptr) {
235-
TSDebug(PLUGIN_NAME, "handle_os_dns had prev, keeping existing response_action and un-setting prev");
236-
// if strategyTxn->prev_host exists, this is the very first call after send_response set the response_action to the next thing
237-
// to try. and no handle_response was called in-between (because it was a connection or dns failure) So keep that, and set
238-
// prev_host=nullptr (so we get a new response_action the next time we're called)
239-
strategyTxn->prev_host = nullptr;
240-
strategyTxn->prev_port = 0;
241-
strategyTxn->prev_is_retry = false;
242-
strategyTxn->prev_no_cache = false;
243-
TSHttpTxnReenable(txnp, TS_EVENT_HTTP_CONTINUE);
244-
return TS_SUCCESS;
156+
const TSServerState server_state = TSHttpTxnServerStateGet(txnp);
157+
if (server_state == TS_SRVSTATE_CONNECTION_ERROR || server_state == TS_SRVSTATE_INACTIVE_TIMEOUT) {
158+
mark_response(txnp, strategyTxn, STATUS_CONNECTION_FAILURE);
245159
}
246160

247161
TSDebug(PLUGIN_NAME, "handle_os_dns had no prev, setting new response_action");
248162

163+
{
164+
TSResponseAction ra;
165+
TSHttpTxnResponseActionGet(txnp, &ra);
166+
strategyTxn->prev_ra = ra;
167+
}
168+
249169
TSResponseAction ra;
250-
memset(&ra, 0, sizeof(TSResponseAction)); // because {0} gives a C++ warning. Ugh.
251-
const char *const exclude_host = nullptr;
252-
const size_t exclude_host_len = 0;
253-
const in_port_t exclude_port = 0;
170+
memset(&ra, 0, sizeof(TSResponseAction));
171+
const char *const exclude_host = strategyTxn->prev_ra.hostname;
172+
const size_t exclude_host_len = strategyTxn->prev_ra.hostname_len;
173+
const in_port_t exclude_port = strategyTxn->prev_ra.port;
254174
strategy->next(txnp, strategyTxn->txn, exclude_host, exclude_host_len, exclude_port, &ra.hostname, &ra.hostname_len, &ra.port,
255175
&ra.is_retry, &ra.no_cache);
256176

257177
ra.fail = ra.hostname == nullptr; // failed is whether to immediately fail and return the client a 502. In this case: whether or
258178
// not we found another parent.
259179
ra.nextHopExists = ra.hostname_len != 0;
260-
ra.responseIsRetryable = strategy->responseIsRetryable(strategyTxn->request_count, STATUS_CONNECTION_FAILURE);
180+
ra.responseIsRetryable = strategy->responseIsRetryable(strategyTxn->request_count - 1, STATUS_CONNECTION_FAILURE);
261181
ra.goDirect = strategy->goDirect();
262182
ra.parentIsProxy = strategy->parentIsProxy();
263183
TSDebug(PLUGIN_NAME, "handle_os_dns setting response_action hostname '%.*s' port %d direct %d proxy %d is_retry %d exists %d",
@@ -298,14 +218,8 @@ handle_hook(TSCont contp, TSEvent event, void *edata)
298218
TSDebug(PLUGIN_NAME, "handle_hook got strategy '%s'", strategyTxn->strategy->name());
299219

300220
switch (event) {
301-
// case TS_EVENT_HTTP_READ_REQUEST_HDR:
302-
// return handle_read_request(txnp, strategyTxn);
303-
case TS_EVENT_HTTP_SEND_REQUEST_HDR:
304-
return handle_send_request(txnp, strategyTxn);
305221
case TS_EVENT_HTTP_READ_RESPONSE_HDR:
306222
return handle_read_response(txnp, strategyTxn);
307-
// case TS_EVENT_HTTP_SEND_RESPONSE_HDR:
308-
// return handle_send_response(txnp, strategyTxn);
309223
case TS_EVENT_HTTP_OS_DNS:
310224
return handle_os_dns(txnp, strategyTxn);
311225
case TS_EVENT_HTTP_TXN_CLOSE:
@@ -417,16 +331,10 @@ TSRemapDoRemap(void *ih, TSHttpTxn txnp, TSRemapRequestInfo *rri)
417331
strategyTxn->strategy = strategy;
418332
strategyTxn->txn = strategy->newTxn();
419333
strategyTxn->request_count = 0;
420-
strategyTxn->prev_host = nullptr;
421-
strategyTxn->prev_port = 0;
422-
strategyTxn->prev_is_retry = false;
423-
strategyTxn->prev_no_cache = false;
334+
memset(&strategyTxn->prev_ra, 0, sizeof(TSResponseAction));
424335
TSContDataSet(cont, (void *)strategyTxn);
425336

426-
// TSHttpTxnHookAdd(txnp, TS_HTTP_READ_REQUEST_HDR_HOOK, cont);
427-
TSHttpTxnHookAdd(txnp, TS_HTTP_SEND_REQUEST_HDR_HOOK, cont);
428337
TSHttpTxnHookAdd(txnp, TS_HTTP_READ_RESPONSE_HDR_HOOK, cont);
429-
// TSHttpTxnHookAdd(txnp, TS_HTTP_SEND_RESPONSE_HDR_HOOK, cont);
430338
TSHttpTxnHookAdd(txnp, TS_HTTP_OS_DNS_HOOK, cont);
431339
TSHttpTxnHookAdd(txnp, TS_HTTP_TXN_CLOSE_HOOK, cont);
432340

proxy/http/HttpTransact.cc

Lines changed: 17 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -1753,11 +1753,21 @@ HttpTransact::HandleApiErrorJump(State *s)
17531753
return;
17541754
}
17551755

1756-
// PPDNSLookupAPICall does an API callout, then calls PPDNSLookup
1756+
// PPDNSLookupAPICall does an API callout if a plugin set the response_action,
1757+
// then calls PPDNSLookup.
1758+
// This is to preserve plugin hook calling behavior pre-9, which didn't call
1759+
// the TS_HTTP_OS_DNS_HOOK on PPDNSLookup.
1760+
// Since response_action is new in 9, only new plugins intentionally setting
1761+
// it will have the new behavior of TS_HTTP_OS_DNS_HOOK firing on PPDNSLookup.
17571762
void
17581763
HttpTransact::PPDNSLookupAPICall(State *s)
17591764
{
1760-
TRANSACT_RETURN(SM_ACTION_API_OS_DNS, PPDNSLookup);
1765+
TxnDebug("http_trans", "[HttpTransact::PPDNSLookupAPICall] response_action.handled %d", s->response_action.handled);
1766+
if (!s->response_action.handled) {
1767+
TRANSACT_RETURN(SM_ACTION_DNS_LOOKUP, PPDNSLookup);
1768+
} else {
1769+
TRANSACT_RETURN(SM_ACTION_API_OS_DNS, PPDNSLookup);
1770+
}
17611771
}
17621772

17631773
///////////////////////////////////////////////////////////////////////////////
@@ -1798,11 +1808,7 @@ HttpTransact::PPDNSLookup(State *s)
17981808

17991809
if (!s->current.server->dst_addr.isValid()) {
18001810
if (s->current.request_to == PARENT_PROXY) {
1801-
if (!s->response_action.handled) {
1802-
TRANSACT_RETURN(SM_ACTION_DNS_LOOKUP, PPDNSLookup);
1803-
} else {
1804-
TRANSACT_RETURN(SM_ACTION_DNS_LOOKUP, PPDNSLookupAPICall);
1805-
}
1811+
TRANSACT_RETURN(SM_ACTION_DNS_LOOKUP, PPDNSLookupAPICall);
18061812
} else if (s->parent_result.result == PARENT_DIRECT && s->http_config_param->no_dns_forward_to_parent != 1) {
18071813
// We ran out of parents but parent configuration allows us to go to Origin Server directly
18081814
CallOSDNSLookup(s);
@@ -2243,7 +2249,7 @@ HttpTransact::LookupSkipOpenServer(State *s)
22432249
find_server_and_update_current_info(s);
22442250

22452251
if (s->current.request_to == PARENT_PROXY) {
2246-
TRANSACT_RETURN(SM_ACTION_DNS_LOOKUP, PPDNSLookup);
2252+
TRANSACT_RETURN(SM_ACTION_DNS_LOOKUP, PPDNSLookupAPICall);
22472253
} else if (s->parent_result.result == PARENT_FAIL) {
22482254
handle_parent_died(s);
22492255
return;
@@ -2950,7 +2956,7 @@ HttpTransact::HandleCacheOpenReadHit(State *s)
29502956
ink_assert(s->pending_work == nullptr);
29512957
s->pending_work = issue_revalidate;
29522958

2953-
TRANSACT_RETURN(SM_ACTION_DNS_LOOKUP, PPDNSLookup);
2959+
TRANSACT_RETURN(SM_ACTION_DNS_LOOKUP, PPDNSLookupAPICall);
29542960
} else if (s->current.request_to == ORIGIN_SERVER) {
29552961
return CallOSDNSLookup(s);
29562962
} else {
@@ -3377,7 +3383,7 @@ HttpTransact::HandleCacheOpenReadMiss(State *s)
33773383
return CallOSDNSLookup(s);
33783384
}
33793385
if (s->current.request_to == PARENT_PROXY) {
3380-
TRANSACT_RETURN(SM_ACTION_DNS_LOOKUP, HttpTransact::PPDNSLookup);
3386+
TRANSACT_RETURN(SM_ACTION_DNS_LOOKUP, HttpTransact::PPDNSLookupAPICall);
33813387
} else {
33823388
handle_parent_died(s);
33833389
return;
@@ -3746,7 +3752,7 @@ HttpTransact::handle_response_from_parent(State *s)
37463752
switch (next_lookup) {
37473753
case PARENT_PROXY:
37483754
ink_assert(s->current.request_to == PARENT_PROXY);
3749-
TRANSACT_RETURN(SM_ACTION_DNS_LOOKUP, PPDNSLookup);
3755+
TRANSACT_RETURN(SM_ACTION_DNS_LOOKUP, PPDNSLookupAPICall);
37503756
break;
37513757
case ORIGIN_SERVER:
37523758
// Next lookup is Origin Server, try DNS for Origin Server

0 commit comments

Comments
 (0)