 from pyth_observer.event import DatadogEvent  # Used dynamically
 from pyth_observer.event import LogEvent  # Used dynamically
 from pyth_observer.event import TelegramEvent  # Used dynamically
-from pyth_observer.event import ZendutyEvent  # Used dynamically
-from pyth_observer.event import Event
+from pyth_observer.event import Context, Event, ZendutyEvent
 from pyth_observer.zenduty import send_zenduty_alert
 
 assert DatadogEvent
@@ -46,6 +45,9 @@ def __init__(self, config, publishers):
         if "ZendutyEvent" in self.config["events"]:
             self.open_alerts_file = os.environ["OPEN_ALERTS_FILE"]
             self.open_alerts = self.load_alerts()
+            # Below is used to store events to send later if multiple failures occur.
+            # Events cannot be stored in open_alerts as they are not JSON serializable.
+            self.zenduty_events = {}
 
     def load_alerts(self):
         try:
@@ -68,17 +70,14 @@ async def run(self, states: List[State]):
 
         # Then, wrap each failed check in events and send them
         sent_events: List[Awaitable] = []
-        context = {
-            "network": self.config["network"]["name"],
-            "publishers": self.publishers,
-        }
+        context = Context(
+            network=self.config["network"]["name"], publishers=self.publishers
+        )
 
         for check in failed_checks:
             for event_type in self.config["events"]:
                 event: Event = globals()[event_type](check, context)
 
-                sent_events.append(event.send())
-
                 if event_type == "ZendutyEvent":
                     # Add failed check to open alerts
                     alert_identifier = (
@@ -87,28 +86,45 @@ async def run(self, states: List[State]):
                     state = check.state()
                     if isinstance(state, PublisherState):
                         alert_identifier += f"-{state.publisher_name}"
-                    self.open_alerts[alert_identifier] = datetime.now().isoformat()
+                    try:
+                        failures = self.open_alerts[alert_identifier]["failures"] + 1
+                    except KeyError:
+                        failures = 1
+                    self.open_alerts[alert_identifier] = {
+                        "last_failure": datetime.now().isoformat(),
+                        "failures": failures,
+                    }
+                    # Store the event to send it later if it fails multiple times
+                    self.zenduty_events[alert_identifier] = event
+                    continue  # do not immediately send a Zenduty alert
+
+                sent_events.append(event.send())
 
         await asyncio.gather(*sent_events)
 
-        # Check open alerts and resolve those that are older than 2 minutes
+        # Check open alerts for Zenduty
         if "ZendutyEvent" in self.config["events"]:
 
             to_remove = []
             current_time = datetime.now()
-            for identifier, last_failure in self.open_alerts.items():
-                if current_time - datetime.fromisoformat(last_failure) >= timedelta(
-                    minutes=2
-                ):
+            for identifier, info in self.open_alerts.items():
+                # Resolve the alert if it last failed > 2 minutes ago
+                if current_time - datetime.fromisoformat(
+                    info["last_failure"]
+                ) >= timedelta(minutes=2):
                     logger.debug(f"Resolving Zenduty alert {identifier}")
                     response = await send_zenduty_alert(
                         alert_identifier=identifier, message=identifier, resolved=True
                     )
                     if response and 200 <= response.status < 300:
                         to_remove.append(identifier)
+                elif info["failures"] > 2:
+                    # Raise alert if the check has failed more than twice before self-resolving
+                    await self.zenduty_events[identifier].send()
 
             for identifier in to_remove:
                 del self.open_alerts[identifier]
+                del self.zenduty_events[identifier]
 
         # Write open alerts to file to ensure persistence
         with open(self.open_alerts_file, "w") as file:
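
For context on the new on-disk format: each open_alerts entry now holds a small dict instead of a bare ISO timestamp. Below is a minimal, illustrative sketch of that structure and the thresholds used in the diff above; the identifier and check name are hypothetical, not taken from the commit.

# Illustrative sketch only (not part of the commit).
import json
from datetime import datetime, timedelta

open_alerts = {
    "SomeCheck-SomeSymbol-some_publisher": {
        "last_failure": datetime.now().isoformat(),  # refreshed on every failure
        "failures": 3,  # incremented on every failure
    }
}

for identifier, info in open_alerts.items():
    stale = datetime.now() - datetime.fromisoformat(info["last_failure"]) >= timedelta(minutes=2)
    if stale:
        print(f"would resolve {identifier}")    # resolved via send_zenduty_alert(..., resolved=True)
    elif info["failures"] > 2:
        print(f"would escalate {identifier}")   # sent via the stored ZendutyEvent

# The open_alerts dict stays JSON-serializable, so it can be persisted as-is:
print(json.dumps(open_alerts, indent=2))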