Add optional CSV output format for search index tool results

yyfamazon · yyfamazon · commit b65df6429633 · 2025-11-19T15:07:31.000+08:00
Signed-off-by: yyfamazon <yyf@amazon.com>
diff --git a/src/opensearch/helper.py b/src/opensearch/helper.py
@@ -3,6 +3,8 @@
 
 import json
 import logging
+import csv
+import io
 from semver import Version
 from tools.tool_params import *
 
@@ -277,6 +279,60 @@ async def get_nodes_info(args: GetNodesArgs) -> json:
         return response
 
 
+def convert_search_results_to_csv(search_results: dict) -> str:
+    """Convert OpenSearch search results to CSV format.
+
+    Columns are _index, _id, _score plus the union of every hit's _source keys,
+    sorted by name. dict/list values are JSON-encoded, None becomes an empty
+    cell, and a missing or null _source or inner 'hits' list is tolerated.
+
+    Args:
+        search_results: The JSON response from search_index function
+
+    Returns:
+        str: CSV formatted string of the search results, or a short message
+            when there is nothing to convert
+    """
+    # Metadata columns emitted for every hit, even when a hit lacks them.
+    metadata_fields = ('_index', '_id', '_score')
+
+    if not search_results or 'hits' not in search_results:
+        return "No search results to convert"
+
+    # Guard the inner lookup: a malformed response yields a message, not an error.
+    outer = search_results['hits']
+    hits = outer.get('hits') if isinstance(outer, dict) else None
+    if not hits:
+        return "No documents found in search results"
+
+    # Extract all unique field names from all documents
+    all_fields = set(metadata_fields)
+    for hit in hits:
+        all_fields.update(hit.get('_source') or {})
+
+    # Sort for a consistent column order
+    fieldnames = sorted(all_fields)
+
+    # Create CSV in memory
+    output = io.StringIO()
+    writer = csv.DictWriter(output, fieldnames=fieldnames)
+    writer.writeheader()
+
+    # Write each document as a row
+    for hit in hits:
+        # Metadata first; _source values are written afterwards.
+        row = {name: hit.get(name, '') for name in metadata_fields}
+        for field, value in (hit.get('_source') or {}).items():
+            # Convert complex objects to JSON strings
+            if isinstance(value, (dict, list)):
+                row[field] = json.dumps(value)
+            else:
+                row[field] = str(value) if value is not None else ''
+        writer.writerow(row)
+
+    return output.getvalue()
+
+
 async def get_opensearch_version(args: baseToolArgs) -> Version:
     """Get the version of OpenSearch cluster.
 
diff --git a/src/tools/tool_params.py b/src/tools/tool_params.py
@@ -80,6 +80,7 @@ class GetIndexMappingArgs(baseToolArgs):
 class SearchIndexArgs(baseToolArgs):
     index: str = Field(description='The name of the index to search in')
     query: Any = Field(description='The search query in OpenSearch query DSL format')
+    format: str = Field(default='json', description='Output format: "json" or "csv"')
 
 
 class GetShardsArgs(baseToolArgs):
diff --git a/src/tools/tools.py b/src/tools/tools.py
@@ -21,6 +21,7 @@
 )
 from .utils import is_tool_compatible
 from opensearch.helper import (
+    convert_search_results_to_csv,
     get_allocation,
     get_cluster_state,
     get_index,
@@ -109,14 +110,23 @@ async def search_index_tool(args: SearchIndexArgs) -> list[dict]:
     try:
         await check_tool_compatibility('SearchIndexTool', args)
         result = await search_index(args)
-        formatted_result = json.dumps(result, indent=2)
-
-        return [
-            {
-                'type': 'text',
-                'text': f'Search results from {args.index}:\n{formatted_result}',
-            }
-        ]
+        
+        if args.format.lower() == 'csv':
+            csv_result = convert_search_results_to_csv(result)
+            return [
+                {
+                    'type': 'text',
+                    'text': f'Search results from {args.index} (CSV format):\n{csv_result}',
+                }
+            ]
+        else:
+            formatted_result = json.dumps(result, indent=2)
+            return [
+                {
+                    'type': 'text',
+                    'text': f'Search results from {args.index} (JSON format):\n{formatted_result}',
+                }
+            ]
     except Exception as e:
         return [{'type': 'text', 'text': f'Error searching index: {str(e)}'}]
 
diff --git a/tests/opensearch/test_helper.py b/tests/opensearch/test_helper.py
@@ -273,3 +273,110 @@ async def test_get_opensearch_version_error(self, mock_get_client):
         # Execute and assert
         result = await get_opensearch_version(args)
         assert result is None
+
+    def test_convert_search_results_to_csv(self):
+        """Test convert_search_results_to_csv against the real helper."""
+        import csv
+        import io
+
+        # Import the production function so the test cannot drift from it.
+        from opensearch.helper import convert_search_results_to_csv
+
+        # Test data - sample OpenSearch search results
+        test_search_results = {
+            "took": 5,
+            "timed_out": False,
+            "_shards": {
+                "total": 1,
+                "successful": 1,
+                "skipped": 0,
+                "failed": 0
+            },
+            "hits": {
+                "total": {
+                    "value": 2,
+                    "relation": "eq"
+                },
+                "max_score": 1.0,
+                "hits": [
+                    {
+                        "_index": "test_index",
+                        "_id": "1",
+                        "_score": 1.0,
+                        "_source": {
+                            "name": "John Doe",
+                            "age": 30,
+                            "city": "New York",
+                            "tags": ["developer", "python"]
+                        }
+                    },
+                    {
+                        "_index": "test_index",
+                        "_id": "2",
+                        "_score": 0.8,
+                        "_source": {
+                            "name": "Jane Smith",
+                            "age": 25,
+                            "city": "San Francisco",
+                            "department": "Engineering"
+                        }
+                    }
+                ]
+            }
+        }
+
+        # Execute
+        csv_output = convert_search_results_to_csv(test_search_results)
+
+        # Assert: parse the CSV back so every cell is compared exactly
+        assert isinstance(csv_output, str)
+        reader = csv.DictReader(io.StringIO(csv_output))
+        rows = list(reader)
+        assert len(rows) == 2  # Header is consumed by DictReader
+
+        # Header is the sorted union of metadata and _source fields
+        assert reader.fieldnames == [
+            '_id', '_index', '_score', 'age', 'city', 'department', 'name', 'tags',
+        ]
+
+        # Check first data row (no department, list JSON-encoded)
+        assert rows[0] == {
+            '_id': '1',
+            '_index': 'test_index',
+            '_score': '1.0',
+            'age': '30',
+            'city': 'New York',
+            'department': '',
+            'name': 'John Doe',
+            'tags': '["developer", "python"]',
+        }
+
+        # Check second data row (no tags)
+        assert rows[1] == {
+            '_id': '2',
+            '_index': 'test_index',
+            '_score': '0.8',
+            'age': '25',
+            'city': 'San Francisco',
+            'department': 'Engineering',
+            'name': 'Jane Smith',
+            'tags': '',
+        }
+
+    def test_convert_search_results_to_csv_empty(self):
+        """Test convert_search_results_to_csv with empty results."""
+        from opensearch.helper import convert_search_results_to_csv
+
+        # Test with empty hits
+        empty_results = {"hits": {"hits": []}}
+        result = convert_search_results_to_csv(empty_results)
+        assert result == "No documents found in search results"
+
+        # Test with no hits key
+        no_hits_results = {"took": 5}
+        result = convert_search_results_to_csv(no_hits_results)
+        assert result == "No search results to convert"
+
+        # Test with None input
+        result = convert_search_results_to_csv(None)
+        assert result == "No search results to convert"