Skip to content

Commit 6aae43c

Browse files
committed
fix: refine scheduler timing and lateness warnings
- Treat Event.scheduled as the nominal start time and compute the next nominal fire time via scheduled_next = scheduled + interval, keeping a fixed-rate schedule grid without accumulating jitter. - Rework QueueEvent() to requeue at scheduled_next when the job finishes on time or early, and to shift the nextqueue to now+10ms only when the nominal time was already missed (same as before, but using Debug instead of a Warning message) - Measure lateness at dispatch time in SchedThread() and emit a Warning only when the start delay exceeds a 100 ms tolerance. This message replaces the previous generic (and often repeated) “last execution overlaps with scheduled execution” warning. - Add optional (commented-out) diagnostic logging in QueueEvent() and SchedThread() to trace nominal schedule times, actual wakeups, and cond wait results when investigating scheduler behavior. It turns out that at least under macOS, the actual timer granularity of pthread_cond_timedwait() is quite bad (overshooting often 3-5 ms)
1 parent 45ff532 commit 6aae43c

File tree

1 file changed

+82
-20
lines changed

1 file changed

+82
-20
lines changed

nsd/sched.c

Lines changed: 82 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -325,9 +325,7 @@ Ns_ScheduleProcEx(Ns_SchedProc *proc, void *clientData, unsigned int flags,
325325
} else {
326326
Event *ePtr;
327327
int isNew;
328-
Ns_Time now;
329328

330-
Ns_GetTime(&now);
331329
ePtr = ns_malloc(sizeof(Event));
332330
ePtr->flags = flags;
333331
ePtr->nextqueue.sec = 0;
@@ -344,6 +342,8 @@ Ns_ScheduleProcEx(Ns_SchedProc *proc, void *clientData, unsigned int flags,
344342
id = (int)NS_ERROR;
345343
ns_free(ePtr);
346344
} else {
345+
Ns_Time now;
346+
347347
do {
348348
static int nextId = 0;
349349

@@ -354,6 +354,7 @@ Ns_ScheduleProcEx(Ns_SchedProc *proc, void *clientData, unsigned int flags,
354354
ePtr->hPtr = Tcl_CreateHashEntry(&eventsTable, INT2PTR(id), &isNew);
355355
} while (isNew == 0);
356356
Tcl_SetHashValue(ePtr->hPtr, ePtr);
357+
Ns_GetTime(&now);
357358
ePtr->id = id;
358359
ePtr->scheduled = now;
359360
QueueEvent(ePtr);
@@ -683,32 +684,64 @@ QueueEvent(Event *ePtr)
683684
}
684685
ePtr->scheduled = ePtr->nextqueue;
685686
} else {
686-
Ns_Time diff, now;
687+
Ns_Time scheduled_next, now;
688+
long cmp;
687689

688-
ePtr->nextqueue = ePtr->scheduled;
689-
Ns_IncrTime(&ePtr->nextqueue, ePtr->interval.sec, ePtr->interval.usec);
690690
/*
691-
* The update time is the next scheduled time.
691+
* "scheduled" : desired time for this run.
692+
* "scheduled_next" : desired time for the next run.
692693
*/
693-
ePtr->scheduled = ePtr->nextqueue;
694+
scheduled_next = ePtr->scheduled;
695+
696+
/* Compute planed next run: scheduled + interval */
697+
ePtr->nextqueue = ePtr->scheduled;
698+
Ns_IncrTime(&scheduled_next, ePtr->interval.sec, ePtr->interval.usec);
694699

700+
/*
701+
* If the job finished before its next nominal start, queue
702+
* exactly at scheduled_next. Otherwise we’d overlap: push the
703+
* next run to now + 10ms.
704+
*/
695705
Ns_GetTime(&now);
696-
d = Ns_DiffTime(&ePtr->nextqueue, &now, &diff);
697-
Ns_Log(Debug, "sched: compute next run time based on: scheduled " NS_TIME_FMT
698-
" diff %ld",
699-
(int64_t)ePtr->scheduled.sec, ePtr->scheduled.usec, d);
700706

701-
if (d == -1) {
707+
cmp = Ns_DiffTime(&scheduled_next, &now, NULL);
708+
709+
if (cmp == 1 || cmp == 0) {
710+
/*
711+
* scheduled_next >= now: we are on time or early
712+
*/
713+
ePtr->nextqueue = scheduled_next;
714+
} else {
702715
/*
703-
* The last execution took longer than the schedule
704-
* interval. Re-schedule after 10ms.
716+
* scheduled_next < now: we already missed the nominal time
705717
*/
706718
ePtr->nextqueue = now;
707719
Ns_IncrTime(&ePtr->nextqueue, 0, 10000);
708-
Ns_Log(Warning, "sched id %d: last execution overlaps with scheduled execution; "
709-
"running late", ePtr->id);
720+
Ns_Log(Debug, "job %d: we missed nominal time", ePtr->id);
710721
}
722+
/*Ns_Log(Notice,
723+
"QueueEvent id %d: now " NS_TIME_FMT " scheduled " NS_TIME_FMT " nextqueue " NS_TIME_FMT,
724+
ePtr->id,
725+
(int64_t)now.sec, now.usec,
726+
(int64_t)ePtr->scheduled.sec, ePtr->scheduled.usec,
727+
(int64_t)ePtr->nextqueue.sec, ePtr->nextqueue.usec);*/
728+
729+
/* store the nominal time for the NEXT run */
730+
ePtr->scheduled = scheduled_next;
711731
}
732+
#if 0
733+
ns_pause 0
734+
ns_schedule_proc 1s {ns_log notice ping}
735+
ns_schedule_proc 1s {ns_sleep 2s}
736+
737+
ns_schedule_proc 1s {ns_log notice ping}
738+
ns_schedule_proc 3s {ns_sleep 5s}
739+
740+
ns_schedule_proc 100ms {ns_sleep 50ms}
741+
742+
ns_schedule_proc 1s {ns_sleep 2s}
743+
ns_schedule_proc -thread 1s {ns_sleep 2s}
744+
#endif
712745

713746
ePtr->qid = ++nqueue;
714747
/*
@@ -972,16 +1005,36 @@ SchedThread(void *UNUSED(arg))
9721005
* For events ready to run, either create a thread for
9731006
* detached events or add to a list of synchronous events.
9741007
*/
975-
9761008
Ns_GetTime(&now);
9771009
while (nqueue > 0 && Ns_DiffTime(&queue[1]->nextqueue, &now, NULL) <= 0) {
1010+
Ns_Time late, tolerate = {0, 100000}; /* 100 ms */
1011+
9781012
ePtr = DeQueueEvent(1);
9791013

9801014
#ifdef NS_SCHED_TRACE_EVENTS
9811015
Ns_Log(Notice, "... dequeue event (id %d) " NS_TIME_FMT,
9821016
ePtr->id,
9831017
(int64_t)ePtr->nextqueue.sec, ePtr->nextqueue.usec);
9841018
#endif
1019+
/*
1020+
* Measure how late we are for THIS execution
1021+
*/
1022+
if (Ns_DiffTime(&now, &ePtr->scheduled, &late) == 1) { /* now > scheduled */
1023+
/*Ns_Log(Notice,
1024+
"at "NS_TIME_FMT ": sched id %d: job started late by %ld.%06ld sec (interval %ld.%06ld sec)",
1025+
(long long)now.sec, (long)now.usec,
1026+
ePtr->id,
1027+
(long)late.sec, (long)late.usec,
1028+
(long)ePtr->interval.sec, (long)ePtr->interval.usec);*/
1029+
1030+
if (Ns_DiffTime(&late, &tolerate, NULL) == 1) { /* late > tolerate */
1031+
Ns_Log(Warning,
1032+
"sched id %d: job started late by %ld.%06ld sec (interval %ld.%06ld sec)",
1033+
ePtr->id,
1034+
(long)late.sec, (long)late.usec,
1035+
(long)ePtr->interval.sec, (long)ePtr->interval.usec);
1036+
}
1037+
}
9851038
if ((ePtr->flags & NS_SCHED_ONCE) != 0u) {
9861039
Tcl_DeleteHashEntry(ePtr->hPtr);
9871040
ePtr->hPtr = NULL;
@@ -1057,17 +1110,26 @@ SchedThread(void *UNUSED(arg))
10571110
QueueEvent(ePtr);
10581111
}
10591112
}
1060-
10611113
/*
10621114
* Wait for the next ready event.
10631115
*/
10641116
if (nqueue == 0) {
10651117
Ns_CondWait(&schedcond, &lock);
10661118
} else if (!shutdownPending) {
1119+
Ns_ReturnCode status;
1120+
10671121
timeout = queue[1]->nextqueue;
1068-
(void) Ns_CondTimedWait(&schedcond, &lock, &timeout);
1122+
/*Ns_Log(Notice, "after job next queue timeout " NS_TIME_FMT,
1123+
(int64_t)timeout.sec, timeout.usec);*/
1124+
status = Ns_CondTimedWait(&schedcond, &lock, &timeout);
1125+
(void)status;
1126+
/*{
1127+
Ns_Time now;
1128+
Ns_GetTime(&now);
1129+
Ns_Log(Notice, "at " NS_TIME_FMT ": CondTimedWait returned %s",
1130+
(int64_t)now.sec, now.usec, Ns_ReturnCodeString(status));
1131+
}*/
10691132
}
1070-
10711133
}
10721134

10731135
/*

0 commit comments

Comments
 (0)