Fix: Enhance telemetry cleanup to prevent agent termination hang

github-actions[bot] · Mervin Praison · github-actions[bot] · commit f345477cdecc · 2025-07-18T16:04:37.000Z
- Added comprehensive telemetry cleanup calls in agent.py at all return points
- Ensures proper cleanup after guardrail validation failures
- Prevents hanging during agent termination by forcing telemetry shutdown
- Added test files to validate telemetry cleanup functionality
- Addresses hanging issues in async agent execution flows

🤖 Generated with [Claude Code](https://claude.ai/code)

Co-Authored-By: Mervin Praison <mervin@praison.ai>
diff --git a/src/praisonai-agents/praisonaiagents/agent/agent.py b/src/praisonai-agents/praisonaiagents/agent/agent.py
@@ -1491,11 +1491,15 @@ def __init__(self, data):
                                     validated_response = self._apply_guardrail_with_retry(response_text, original_prompt, temperature, tools, task_name, task_description, task_id)
                                     # Execute callback after validation
                                     self._execute_callback_and_display(original_prompt, validated_response, time.time() - start_time, task_name, task_description, task_id)
+                                    # Ensure proper cleanup of telemetry system to prevent hanging
+                                    self._cleanup_telemetry()
                                     return validated_response
                                 except Exception as e:
                                     logging.error(f"Agent {self.name}: Guardrail validation failed after reflection: {e}")
                                     # Rollback chat history on guardrail failure
                                     self.chat_history = self.chat_history[:chat_history_length]
+                                    # Ensure proper cleanup of telemetry system to prevent hanging
+                                    self._cleanup_telemetry()
                                     return None
 
                             # Check if we've hit max reflections
@@ -1509,11 +1513,15 @@ def __init__(self, data):
                                     validated_response = self._apply_guardrail_with_retry(response_text, original_prompt, temperature, tools, task_name, task_description, task_id)
                                     # Execute callback after validation
                                     self._execute_callback_and_display(original_prompt, validated_response, time.time() - start_time, task_name, task_description, task_id)
+                                    # Ensure proper cleanup of telemetry system to prevent hanging
+                                    self._cleanup_telemetry()
                                     return validated_response
                                 except Exception as e:
                                     logging.error(f"Agent {self.name}: Guardrail validation failed after max reflections: {e}")
                                     # Rollback chat history on guardrail failure
                                     self.chat_history = self.chat_history[:chat_history_length]
+                                    # Ensure proper cleanup of telemetry system to prevent hanging
+                                    self._cleanup_telemetry()
                                     return None
                             
                             # If not satisfactory and not at max reflections, continue with regeneration
@@ -1646,11 +1654,15 @@ async def achat(self, prompt: str, temperature=0.2, tools=None, output_json=None
                         validated_response = self._apply_guardrail_with_retry(response_text, prompt, temperature, tools, task_name, task_description, task_id)
                         # Execute callback after validation
                         self._execute_callback_and_display(normalized_content, validated_response, time.time() - start_time, task_name, task_description, task_id)
+                        # Ensure proper cleanup of telemetry system to prevent hanging
+                        self._cleanup_telemetry()
                         return validated_response
                     except Exception as e:
                         logging.error(f"Agent {self.name}: Guardrail validation failed for custom LLM: {e}")
                         # Rollback chat history on guardrail failure
                         self.chat_history = self.chat_history[:chat_history_length]
+                        # Ensure proper cleanup of telemetry system to prevent hanging
+                        self._cleanup_telemetry()
                         return None
                 except Exception as e:
                     # Rollback chat history if LLM call fails
@@ -1726,6 +1738,8 @@ async def achat(self, prompt: str, temperature=0.2, tools=None, output_json=None
                             logging.debug(f"Agent.achat completed in {total_time:.2f} seconds")
                         # Execute callback after tool completion
                         self._execute_callback_and_display(original_prompt, result, time.time() - start_time, task_name, task_description, task_id)
+                        # Ensure proper cleanup of telemetry system to prevent hanging
+                        self._cleanup_telemetry()
                         return result
                     elif output_json or output_pydantic:
                         response = await self._openai_client.async_client.chat.completions.create(
@@ -1740,6 +1754,8 @@ async def achat(self, prompt: str, temperature=0.2, tools=None, output_json=None
                             logging.debug(f"Agent.achat completed in {total_time:.2f} seconds")
                         # Execute callback after JSON/Pydantic completion
                         self._execute_callback_and_display(original_prompt, response_text, time.time() - start_time, task_name, task_description, task_id)
+                        # Ensure proper cleanup of telemetry system to prevent hanging
+                        self._cleanup_telemetry()
                         return response_text
                     else:
                         response = await self._openai_client.async_client.chat.completions.create(
diff --git a/test_telemetry_cleanup_fix.py b/test_telemetry_cleanup_fix.py
@@ -0,0 +1,59 @@
+#!/usr/bin/env python3
+"""
+Test to verify telemetry cleanup fixes work correctly.
+This test checks that agents terminate properly without hanging after our cleanup fixes.
+"""
+
+import threading
+import time
+from praisonaiagents import Agent
+
+def test_telemetry_cleanup():
+    """Chat twice with an Agent and report (as a bool, not an assertion) whether the thread count stayed within initial + 5."""
+    
+    # Record initial thread count
+    initial_threads = threading.active_count()
+    print(f"Initial thread count: {initial_threads}")
+    
+    # Create agent
+    agent = Agent(
+        name="TestAgent",
+        role="Test Agent",
+        goal="Test telemetry cleanup",
+        instructions="Return a simple response"
+    )
+    
+    # Test regular chat completion
+    print("Testing regular chat completion...")
+    response = agent.chat("Hello", stream=False)
+    print(f"Response: {response}")
+    
+    # Test with self-reflection (to test the paths we fixed)
+    print("\nTesting self-reflection path...")
+    agent.self_reflect = True
+    agent.min_reflect = 1
+    agent.max_reflect = 1
+    response = agent.chat("What is 2+2?", stream=False)
+    print(f"Reflection response: {response}")
+    
+    # Give background threads 2 seconds to exit before they are counted
+    time.sleep(2)
+    
+    # Check final thread count
+    final_threads = threading.active_count()
+    print(f"Final thread count: {final_threads}")
+    
+    # Heuristic only: up to 5 threads beyond the initial count are still reported as success
+    if final_threads <= initial_threads + 5:  # Allow for some background threads
+        print("✅ Telemetry cleanup appears to be working correctly")
+        return True
+    else:
+        print(f"❌ Possible thread leak detected: {final_threads - initial_threads} extra threads")
+        return False
+
+if __name__ == "__main__":
+    success = test_telemetry_cleanup()
+    if success:
+        print("\n✅ All tests passed - telemetry cleanup is working correctly!")
+    else:
+        print("\n❌ Tests failed - there may be remaining telemetry cleanup issues")
diff --git a/test_telemetry_cleanup_simple.py b/test_telemetry_cleanup_simple.py
@@ -0,0 +1,110 @@
+#!/usr/bin/env python3
+"""
+Simple test to verify telemetry cleanup fixes work correctly.
+This test focuses on the cleanup functionality without requiring OpenAI API calls.
+"""
+
+import os
+import threading
+import time
+import sys
+import logging
+
+# Set up logging to see debug info
+logging.basicConfig(level=logging.DEBUG)
+
+# Set a fake API key to avoid errors
+os.environ['OPENAI_API_KEY'] = 'test-key-for-cleanup-test'
+os.environ['OPENAI_API_BASE'] = 'http://localhost:1234/v1'
+
+def test_telemetry_cleanup_direct():
+    """Call force_shutdown_telemetry() and return True if at most 2 threads were added; return False on a larger increase or any exception."""
+    
+    # Record initial thread count
+    initial_threads = threading.active_count()
+    print(f"Initial thread count: {initial_threads}")
+    
+    # Test the telemetry cleanup function directly
+    try:
+        from praisonaiagents.telemetry.telemetry import get_telemetry, force_shutdown_telemetry
+        
+        # Get a telemetry instance
+        telemetry = get_telemetry()
+        print(f"Telemetry enabled: {telemetry.enabled}")
+        
+        # Report whether the telemetry object has a truthy `_posthog` attribute
+        if hasattr(telemetry, '_posthog') and telemetry._posthog:
+            print("PostHog client initialized")
+        else:
+            print("PostHog client not initialized")
+        
+        # Test cleanup
+        print("Testing force_shutdown_telemetry()...")
+        force_shutdown_telemetry()
+        
+        # Give threads 1 second to exit before they are counted
+        time.sleep(1)
+        
+        # Check final thread count
+        final_threads = threading.active_count()
+        print(f"Final thread count: {final_threads}")
+        
+        # List remaining threads
+        remaining_threads = threading.enumerate()
+        print(f"Remaining threads: {[t.name for t in remaining_threads]}")
+        
+        # Heuristic only: up to 2 threads beyond the initial count are still reported as success
+        if final_threads <= initial_threads + 2:  # Allow for some background threads
+            print("✅ Telemetry cleanup appears to be working correctly")
+            return True
+        else:
+            print(f"❌ Possible thread leak detected: {final_threads - initial_threads} extra threads")
+            return False
+            
+    except Exception as e:
+        print(f"❌ Error during telemetry cleanup test: {e}")
+        return False
+
+def test_agent_cleanup_method():
+    """Construct an Agent and call its _cleanup_telemetry(); return True if neither step raises, else print the error and return False."""
+    
+    try:
+        from praisonaiagents import Agent
+        
+        # Create agent
+        agent = Agent(
+            name="TestAgent",
+            role="Test Agent",
+            goal="Test telemetry cleanup",
+            instructions="Return a simple response"
+        )
+        
+        # Call the cleanup method directly; success here only means no exception was raised
+        print("Testing agent._cleanup_telemetry()...")
+        agent._cleanup_telemetry()
+        
+        print("✅ Agent cleanup method executed successfully")
+        return True
+        
+    except Exception as e:
+        print(f"❌ Error during agent cleanup test: {e}")
+        return False
+
+if __name__ == "__main__":
+    print("Testing telemetry cleanup fixes...")
+    
+    # Test 1: Direct telemetry cleanup
+    print("\n=== Test 1: Direct telemetry cleanup ===")
+    test1_success = test_telemetry_cleanup_direct()
+    
+    # Test 2: Agent cleanup method
+    print("\n=== Test 2: Agent cleanup method ===")
+    test2_success = test_agent_cleanup_method()
+    
+    # Final result
+    if test1_success and test2_success:
+        print("\n✅ All tests passed - telemetry cleanup is working correctly!")
+        sys.exit(0)
+    else:
+        print("\n❌ Some tests failed - there may be remaining telemetry cleanup issues")
+        sys.exit(1)
diff --git a/test_thread_cleanup.py b/test_thread_cleanup.py
@@ -0,0 +1,110 @@
+#!/usr/bin/env python3
+"""
+Test to specifically check for telemetry thread cleanup and prevent hanging.
+This test simulates the scenario where telemetry threads could cause hanging.
+"""
+
+import threading
+import time
+import sys
+import os
+
+# Set a fake API key to avoid errors
+os.environ['OPENAI_API_KEY'] = 'test-key-for-cleanup-test'
+os.environ['OPENAI_API_BASE'] = 'http://localhost:1234/v1'
+
+def test_thread_cleanup():
+    """Track two telemetry events, force shutdown, and return True only if every thread alive afterwards was already alive at the start."""
+    
+    print(f"Initial thread count: {threading.active_count()}")
+    initial_threads = set(threading.enumerate())
+    
+    # Import and use telemetry
+    from praisonaiagents.telemetry.telemetry import get_telemetry, force_shutdown_telemetry
+    
+    # Get telemetry instance (this might start background threads)
+    telemetry = get_telemetry()
+    
+    # Track some events to potentially start background threads
+    telemetry.track_agent_execution("test_agent", success=True)
+    telemetry.track_tool_usage("test_tool", success=True)
+    telemetry.flush()
+    
+    # Wait 0.5 seconds so any background threads have time to start
+    time.sleep(0.5)
+    
+    after_telemetry_threads = set(threading.enumerate())
+    new_threads = after_telemetry_threads - initial_threads
+    
+    print(f"After telemetry initialization: {threading.active_count()} threads")
+    if new_threads:
+        print(f"New threads created: {[t.name for t in new_threads]}")
+    
+    # Now force cleanup
+    print("Forcing telemetry cleanup...")
+    force_shutdown_telemetry()
+    
+    # Wait 1 second for cleanup to complete
+    time.sleep(1)
+    
+    final_threads = set(threading.enumerate())
+    remaining_new_threads = final_threads - initial_threads
+    
+    print(f"Final thread count: {threading.active_count()}")
+    print(f"Final threads: {[t.name for t in final_threads]}")
+    
+    if remaining_new_threads:
+        print(f"❌ Threads still remaining after cleanup: {[t.name for t in remaining_new_threads]}")
+        return False
+    else:
+        print("✅ All telemetry threads cleaned up successfully")
+        return True
+
+def test_agent_cleanup():
+    """Create an Agent, call its _cleanup_telemetry(), and return True if at most 1 thread was added overall, else False."""
+    
+    from praisonaiagents import Agent
+    
+    initial_threads = threading.active_count()
+    print(f"Initial thread count before agent: {initial_threads}")
+    
+    # Create agent
+    agent = Agent(
+        name="TestAgent",
+        role="Test Agent", 
+        goal="Test cleanup",
+        instructions="Test"
+    )
+    
+    after_agent_threads = threading.active_count()
+    print(f"Thread count after agent creation: {after_agent_threads}")
+    
+    # Force cleanup
+    agent._cleanup_telemetry()
+    
+    # Wait 0.5 seconds for cleanup before re-counting threads
+    time.sleep(0.5)
+    
+    final_threads = threading.active_count()
+    print(f"Final thread count after agent cleanup: {final_threads}")
+    
+    if final_threads <= initial_threads + 1:  # Allow for some variance
+        print("✅ Agent cleanup successful")
+        return True
+    else:
+        print(f"❌ Agent cleanup may have left threads: {final_threads - initial_threads} extra")
+        return False
+
+if __name__ == "__main__":
+    print("Testing thread cleanup to prevent hanging...")
+    
+    test1 = test_thread_cleanup()
+    print()
+    test2 = test_agent_cleanup()
+    
+    if test1 and test2:
+        print("\n✅ All thread cleanup tests passed!")
+        sys.exit(0)
+    else:
+        print("\n❌ Some thread cleanup tests failed!")
+        sys.exit(1)