88- `COSAgentProvider`: Use in machine charms that need to have a workload's metrics 
99  or logs scraped, or forward rule files or dashboards to Prometheus, Loki or Grafana through 
1010  the Grafana Agent machine charm. 
11+   NOTE: Be sure to add `limit: 1` in your charm for the cos-agent relation. That is the only 
12+    way we currently have to prevent two different grafana agent apps from being deployed on the same VM. 
1113
1214- `COSAgentConsumer`: Used in the Grafana Agent machine charm to manage the requirer side of 
1315  the `cos_agent` interface. 
2224Using the `COSAgentProvider` object only requires instantiating it, 
2325typically in the `__init__` method of your charm (the one which sends telemetry). 
2426
25- The constructor of `COSAgentProvider` has only one required and ten optional parameters: 
2627
2728```python 
2829    def __init__( 
@@ -233,8 +234,8 @@ def __init__(self, *args):
233234)
234235
235236import  pydantic 
236- from  cosl  import  GrafanaDashboard , JujuTopology 
237- from  cosl .rules  import  AlertRules 
237+ from  cosl  import  DashboardPath40UID , JujuTopology ,  LZMABase64 
238+ from  cosl .rules  import  AlertRules ,  generic_alert_groups 
238239from  ops .charm  import  RelationChangedEvent 
239240from  ops .framework  import  EventBase , EventSource , Object , ObjectEvents 
240241from  ops .model  import  ModelError , Relation 
@@ -253,9 +254,9 @@ class _MetricsEndpointDict(TypedDict):
253254
254255LIBID  =  "dc15fa84cef84ce58155fb84f6c6213a" 
255256LIBAPI  =  0 
256- LIBPATCH  =  11 
257+ LIBPATCH  =  20 
257258
258- PYDEPS  =  ["cosl" , "pydantic" ]
259+ PYDEPS  =  ["cosl >= 0.0.50 " , "pydantic" ]
259260
260261DEFAULT_RELATION_NAME  =  "cos-agent" 
261262DEFAULT_PEER_RELATION_NAME  =  "peers" 
@@ -267,7 +268,6 @@ class _MetricsEndpointDict(TypedDict):
267268logger  =  logging .getLogger (__name__ )
268269SnapEndpoint  =  namedtuple ("SnapEndpoint" , "owner, name" )
269270
270- 
271271# Note: MutableMapping is imported from the typing module and not collections.abc 
272272# because subscripting collections.abc.MutableMapping was added in python 3.9, but 
273273# most of our charms are based on 20.04, which has python 3.8. 
@@ -317,7 +317,11 @@ class NotReadyError(TracingError):
317317    """Raised by the provider wrapper if a requirer hasn't published the required data (yet).""" 
318318
319319
320- class  ProtocolNotRequestedError (TracingError ):
320+ class  ProtocolNotFoundError (TracingError ):
321+     """Raised if the user doesn't receive an endpoint for a protocol it requested.""" 
322+ 
323+ 
324+ class  ProtocolNotRequestedError (ProtocolNotFoundError ):
321325    """Raised if the user attempts to obtain an endpoint for a protocol it did not request.""" 
322326
323327
@@ -476,7 +480,7 @@ class CosAgentProviderUnitData(DatabagModel):
476480    # this needs to make its way to the gagent leader 
477481    metrics_alert_rules : dict 
478482    log_alert_rules : dict 
479-     dashboards : List [GrafanaDashboard ]
483+     dashboards : List [str ]
480484    # subordinate is no longer used but we should keep it until we bump the library to ensure 
481485    # we don't break compatibility. 
482486    subordinate : Optional [bool ] =  None 
@@ -509,7 +513,7 @@ class CosAgentPeersUnitData(DatabagModel):
509513    # of the outgoing o11y relations. 
510514    metrics_alert_rules : Optional [dict ]
511515    log_alert_rules : Optional [dict ]
512-     dashboards : Optional [List [GrafanaDashboard ]]
516+     dashboards : Optional [List [str ]]
513517
514518    # when this whole datastructure is dumped into a databag, it will be nested under this key. 
515519    # while not strictly necessary (we could have it 'flattened out' into the databag), 
@@ -579,7 +583,7 @@ class Receiver(pydantic.BaseModel):
579583    """Specification of an active receiver.""" 
580584
581585    protocol : ProtocolType  =  pydantic .Field (..., description = "Receiver protocol name and type." )
582-     url : str  =  pydantic .Field (
586+     url : Optional [ str ]  =  pydantic .Field (
583587        ...,
584588        description = """URL at which the receiver is reachable. If there's an ingress, it would be the external URL. 
585589        Otherwise, it would be the service's fqdn or internal IP. 
@@ -727,6 +731,10 @@ def _metrics_alert_rules(self) -> Dict:
727731            query_type = "promql" , topology = JujuTopology .from_charm (self ._charm )
728732        )
729733        alert_rules .add_path (self ._metrics_rules , recursive = self ._recursive )
734+         alert_rules .add (
735+             generic_alert_groups .application_rules ,
736+             group_name_prefix = JujuTopology .from_charm (self ._charm ).identifier ,
737+         )
730738        return  alert_rules .as_dict ()
731739
732740    @property  
@@ -737,12 +745,27 @@ def _log_alert_rules(self) -> Dict:
737745        return  alert_rules .as_dict ()
738746
739747    @property  
740-     def  _dashboards (self ) ->  List [GrafanaDashboard ]:
741-         dashboards : List [GrafanaDashboard ] =  []
748+     def  _dashboards (self ) ->  List [str ]:
749+         dashboards : List [str ] =  []
742750        for  d  in  self ._dashboard_dirs :
743751            for  path  in  Path (d ).glob ("*" ):
744-                 dashboard  =  GrafanaDashboard ._serialize (path .read_bytes ())
745-                 dashboards .append (dashboard )
752+                 with  open (path , "rt" ) as  fp :
753+                     dashboard  =  json .load (fp )
754+                 rel_path  =  str (
755+                     path .relative_to (self ._charm .charm_dir ) if  path .is_absolute () else  path 
756+                 )
757+                 # COSAgentProvider is somewhat analogous to GrafanaDashboardProvider. We need to overwrite the uid here 
758+                 # because there is currently no other way to communicate the dashboard path separately. 
759+                 # https://github.com/canonical/grafana-k8s-operator/pull/363 
760+                 dashboard ["uid" ] =  DashboardPath40UID .generate (self ._charm .meta .name , rel_path )
761+ 
762+                 # Add tags 
763+                 tags : List [str ] =  dashboard .get ("tags" , [])
764+                 if  not  any (tag .startswith ("charm: " ) for  tag  in  tags ):
765+                     tags .append (f"charm: { self ._charm .meta .name }  )
766+                 dashboard ["tags" ] =  tags 
767+ 
768+                 dashboards .append (LZMABase64 .compress (json .dumps (dashboard )))
746769        return  dashboards 
747770
748771    @property  
@@ -768,7 +791,7 @@ def is_ready(self, relation: Optional[Relation] = None):
768791        """Is this endpoint ready?""" 
769792        relation  =  relation  or  self ._relation 
770793        if  not  relation :
771-             logger .debug (f"no relation on { self ._relation_name   !r}  )
794+             logger .debug (f"no relation on { self ._relation_name !r}  )
772795            return  False 
773796        if  relation .data  is  None :
774797            logger .error (f"relation data is None for { relation }  )
@@ -802,29 +825,48 @@ def get_all_endpoints(
802825
803826    def  _get_tracing_endpoint (
804827        self , relation : Optional [Relation ], protocol : ReceiverProtocol 
805-     ) ->  Optional [str ]:
828+     ) ->  str :
829+         """Return a tracing endpoint URL if it is available or raise a ProtocolNotFoundError.""" 
806830        unit_data  =  self .get_all_endpoints (relation )
807831        if  not  unit_data :
808-             return  None 
832+             # we didn't find the protocol because the remote end didn't publish any data yet 
833+             # it might also mean that grafana-agent doesn't have a relation to the tracing backend 
834+             raise  ProtocolNotFoundError (protocol )
809835        receivers : List [Receiver ] =  [i  for  i  in  unit_data .receivers  if  i .protocol .name  ==  protocol ]
810836        if  not  receivers :
811-             logger .error (f"no receiver found with protocol={ protocol !r}  )
812-             return  None 
837+             # we didn't find the protocol because grafana-agent didn't return the protocol that we requested; 
838+             # the caller might want to verify that we did indeed request this protocol 
839+             raise  ProtocolNotFoundError (protocol )
813840        if  len (receivers ) >  1 :
814-             logger .error (
841+             logger .warning (
815842                f"too many receivers with protocol={ protocol !r} { receivers }  
816843            )
817-             return  None 
818844
819845        receiver  =  receivers [0 ]
846+         if  not  receiver .url :
847+             # grafana-agent isn't connected to the tracing backend yet 
848+             raise  ProtocolNotFoundError (protocol )
820849        return  receiver .url 
821850
822851    def  get_tracing_endpoint (
823852        self , protocol : ReceiverProtocol , relation : Optional [Relation ] =  None 
824-     ) ->  Optional [str ]:
825-         """Receiver endpoint for the given protocol.""" 
826-         endpoint  =  self ._get_tracing_endpoint (relation  or  self ._relation , protocol = protocol )
827-         if  not  endpoint :
853+     ) ->  str :
854+         """Receiver endpoint for the given protocol. 
855+ 
856+         It could happen that this function gets called before the provider publishes the endpoints. 
857+         In such a scenario, if a non-leader unit calls this function, a permission denied exception will be raised due to 
858+         restricted access. To prevent this, this function needs to be guarded by the `is_ready` check. 
859+ 
860+         Raises: 
861+         ProtocolNotRequestedError: 
862+             If the charm unit is the leader unit and attempts to obtain an endpoint for a protocol it did not request. 
863+         ProtocolNotFoundError: 
864+             If the charm attempts to obtain an endpoint when grafana-agent isn't related to a tracing backend. 
865+         """ 
866+         try :
867+             return  self ._get_tracing_endpoint (relation  or  self ._relation , protocol = protocol )
868+         except  ProtocolNotFoundError :
869+             # let's see if we didn't find it because we didn't request the endpoint 
828870            requested_protocols  =  set ()
829871            relations  =  [relation ] if  relation  else  self .relations 
830872            for  relation  in  relations :
@@ -839,8 +881,7 @@ def get_tracing_endpoint(
839881            if  protocol  not  in requested_protocols :
840882                raise  ProtocolNotRequestedError (protocol , relation )
841883
842-             return  None 
843-         return  endpoint 
884+             raise 
844885
845886
846887class  COSAgentDataChanged (EventBase ):
@@ -902,6 +943,8 @@ def __init__(
902943            events .relation_joined , self ._on_relation_data_changed 
903944        )  # TODO: do we need this? 
904945        self .framework .observe (events .relation_changed , self ._on_relation_data_changed )
946+         self .framework .observe (events .relation_departed , self ._on_relation_departed )
947+ 
905948        for  event  in  self ._refresh_events :
906949            self .framework .observe (event , self .trigger_refresh )  # pyright: ignore 
907950
@@ -929,6 +972,26 @@ def _on_peer_relation_changed(self, _):
929972        if  self ._charm .unit .is_leader ():
930973            self .on .data_changed .emit ()  # pyright: ignore 
931974
975+     def  _on_relation_departed (self , event ):
976+         """Remove provider's (principal's) alert rules and dashboards from peer data when the cos-agent relation to the principal is removed.""" 
977+         if  not  self .peer_relation :
978+             event .defer ()
979+             return 
980+         # empty the departing unit's alert rules and dashboards from peer data 
981+         data  =  CosAgentPeersUnitData (
982+             unit_name = event .unit .name ,
983+             relation_id = str (event .relation .id ),
984+             relation_name = event .relation .name ,
985+             metrics_alert_rules = {},
986+             log_alert_rules = {},
987+             dashboards = [],
988+         )
989+         self .peer_relation .data [self ._charm .unit ][
990+             f"{ CosAgentPeersUnitData .KEY } { event .unit .name }  
991+         ] =  data .json ()
992+ 
993+         self .on .data_changed .emit ()  # pyright: ignore 
994+ 
932995    def  _on_relation_data_changed (self , event : RelationChangedEvent ):
933996        # Peer data is the only means of communication between subordinate units. 
934997        if  not  self .peer_relation :
@@ -988,7 +1051,16 @@ def update_tracing_receivers(self):
9881051                CosAgentRequirerUnitData (
9891052                    receivers = [
9901053                        Receiver (
991-                             url = f"{ self ._get_tracing_receiver_url (protocol )}  ,
1054+                             # If tracing isn't ready, we don't want the wrong receiver URLs present in the databag. 
1055+                             # However, because of the backwards compatibility requirements, we still need to provide 
1056+                             # the protocols list so that a charm with an older cos_agent version doesn't error in its hooks. 
1057+                             # Before this change was added, a charm with an old cos_agent version threw exceptions, with 
1058+                             # connections to grafana-agent timing out. After the change, such a charm will fail to validate 
1059+                             # the databag contents (as it expects a string in URL), but that won't cause any errors because 
1060+                             # tracing endpoints are the only content in the grafana-agent's side of the databag. 
1061+                             url = f"{ self ._get_tracing_receiver_url (protocol )}  
1062+                             if  self ._charm .tracing .is_ready ()  # type: ignore 
1063+                             else  None ,
9921064                            protocol = ProtocolType (
9931065                                name = protocol ,
9941066                                type = receiver_protocol_to_transport_protocol [protocol ],
@@ -1030,8 +1102,7 @@ def _get_requested_protocols(self, relation: Relation):
10301102        if  len (units ) >  1 :
10311103            # should never happen 
10321104            raise  ValueError (
1033-                 f"unexpected error: subordinate relation { relation }  
1034-                 f"should have exactly one unit" 
1105+                 f"unexpected error: subordinate relation { relation }  
10351106            )
10361107
10371108        unit  =  next (iter (units ), None )
@@ -1287,7 +1358,7 @@ def dashboards(self) -> List[Dict[str, str]]:
12871358            seen_apps .append (app_name )
12881359
12891360            for  encoded_dashboard  in  data .dashboards  or  ():
1290-                 content  =  GrafanaDashboard ( encoded_dashboard ). _deserialize ( )
1361+                 content  =  json . loads ( LZMABase64 . decompress ( encoded_dashboard ) )
12911362
12921363                title  =  content .get ("title" , "no_title" )
12931364
@@ -1314,44 +1385,32 @@ def charm_tracing_config(
13141385    If https endpoint is provided but cert_path is not found on disk: 
13151386     disable charm tracing. 
13161387    If https endpoint is provided and cert_path is None: 
1317-      ERROR  
1388+      raise TracingError  
13181389    Else: 
13191390     proceed with charm tracing (with or without tls, as appropriate) 
13201391
13211392    Usage: 
1322-       If you are using charm_tracing >= v1.9: 
1323-     >>> from lib.charms.tempo_k8s.v1.charm_tracing import trace_charm 
1324-     >>> from lib.charms.tempo_k8s.v0.cos_agent import charm_tracing_config 
1393+     >>> from lib.charms.tempo_coordinator_k8s.v0.charm_tracing import trace_charm 
1394+     >>> from lib.charms.tempo_coordinator_k8s.v0.tracing import charm_tracing_config 
13251395    >>> @trace_charm(tracing_endpoint="my_endpoint", cert_path="cert_path") 
13261396    >>> class MyCharm(...): 
13271397    >>>     _cert_path = "/path/to/cert/on/charm/container.crt" 
13281398    >>>     def __init__(self, ...): 
1329-     >>>         self.cos_agent  = COSAgentProvider (...) 
1399+     >>>         self.tracing  = TracingEndpointRequirer (...) 
13301400    >>>         self.my_endpoint, self.cert_path = charm_tracing_config( 
1331-     ...             self.cos_agent, self._cert_path) 
1332- 
1333-       If you are using charm_tracing < v1.9: 
1334-     >>> from lib.charms.tempo_k8s.v1.charm_tracing import trace_charm 
1335-     >>> from lib.charms.tempo_k8s.v2.tracing import charm_tracing_config 
1336-     >>> @trace_charm(tracing_endpoint="my_endpoint", cert_path="cert_path") 
1337-     >>> class MyCharm(...): 
1338-     >>>     _cert_path = "/path/to/cert/on/charm/container.crt" 
1339-     >>>     def __init__(self, ...): 
1340-     >>>         self.cos_agent = COSAgentProvider(...) 
1341-     >>>         self.my_endpoint, self.cert_path = charm_tracing_config( 
1342-     ...             self.cos_agent, self._cert_path) 
1343-     >>>     @property 
1344-     >>>     def my_endpoint(self): 
1345-     >>>         return self._my_endpoint 
1346-     >>>     @property 
1347-     >>>     def cert_path(self): 
1348-     >>>         return self._cert_path 
1349- 
1401+     ...             self.tracing, self._cert_path) 
13501402    """ 
13511403    if  not  endpoint_requirer .is_ready ():
13521404        return  None , None 
13531405
1354-     endpoint  =  endpoint_requirer .get_tracing_endpoint ("otlp_http" )
1406+     try :
1407+         endpoint  =  endpoint_requirer .get_tracing_endpoint ("otlp_http" )
1408+     except  ProtocolNotFoundError :
1409+         logger .warn (
1410+             "Endpoint for tracing wasn't provided as tracing backend isn't ready yet. If grafana-agent isn't connected to a tracing backend, integrate it. Otherwise this issue should resolve itself in a few events." 
1411+         )
1412+         return  None , None 
1413+ 
13551414    if  not  endpoint :
13561415        return  None , None 
13571416
0 commit comments