Add gzip support for load_ndjson() (#207)

FindHao · meta-codesync[bot] · commit 24e002eb7724 · 2025-12-02T20:56:59.000-08:00
Summary: This PR adds support for loading gzip-compressed NDJSON files in the `load_ndjson()` function, fixing an issue where the CLI claimed to support `.ndjson.gz` files but the function only used `open()`.

## Supported Formats

| Format | Extension | Description |
|--------|-----------|-------------|
| Uncompressed | `.ndjson` | Standard NDJSON (existing) |
| Gzip compressed | `.ndjson.gz` | Whole file compressed |
| Gzip member concatenation | `.bin.ndjson` | Each line compressed separately |

## Changes

- **`tritonparse/tools/prettify_ndjson.py`**:
  - Added `import gzip`
  - Added `_is_gzip_file()` helper function to detect compressed files
  - Modified `load_ndjson()` to use `gzip.open()` for compressed files
  - Updated docstring to document supported formats
- **`tests/test_tritonparse.py`**:
  - Added `test_load_ndjson_gzip_support()` test using existing `.ndjson.gz` test file

## Testing

Uses existing test file: `tests/example_output/parsed_output_complex/dedicated_log_triton_trace_findhao__mapped.ndjson.gz`

Pull Request resolved: #207

Reviewed By: wychi

Differential Revision: D88171069

Pulled By: FindHao

fbshipit-source-id: 701a238a3d9d34d1d096834088a4ac87cb16ed09
diff --git a/tests/test_tritonparse.py b/tests/test_tritonparse.py
@@ -307,6 +307,35 @@ def test_loc_alias_parsing(self):
 
         print("✓ All loc alias parsing tests passed")
 
+    def test_load_ndjson_gzip_support(self):
+        """Check that load_ndjson can read a gzip-compressed .ndjson.gz file."""
+        from pathlib import Path
+
+        from tritonparse.tools.prettify_ndjson import load_ndjson
+
+        # Fixture that already lives next to this test module
+        fixture_dir = Path(__file__).parent / "example_output/parsed_output_complex"
+        archive = fixture_dir / "dedicated_log_triton_trace_findhao__mapped.ndjson.gz"
+
+        # Fail early with a clear message if the fixture is missing
+        self.assertTrue(archive.exists(), f"Test file not found: {archive}")
+
+        records = load_ndjson(archive)
+        self.assertIsInstance(records, list)
+        self.assertGreater(len(records), 0, "Should load at least one event")
+
+        # Collect the event types present, ignoring any non-dict entries
+        seen_types = set()
+        for record in records:
+            if isinstance(record, dict):
+                seen_types.add(record.get("event_type"))
+        self.assertTrue(
+            bool(seen_types & {"compilation", "launch"}),
+            f"Expected compilation or launch events, got: {seen_types}",
+        )
+
+        print(f"✓ Successfully loaded {len(records)} events from .ndjson.gz file")
+
 
 class TestTritonparseCUDA(unittest.TestCase):
     """CUDA tests (require GPU)"""
diff --git a/tritonparse/tools/prettify_ndjson.py b/tritonparse/tools/prettify_ndjson.py
@@ -39,12 +39,19 @@
 """
 
 import argparse
+import gzip
 import json
 import sys
 from pathlib import Path
 from typing import Any, List, Union
 
 
+def _is_gzip_file(file_path: Path) -> bool:
+    """Return True when the path ends in a gzip suffix (.gz or .bin.ndjson)."""
+    gzip_suffixes = (".gz", ".bin.ndjson")
+    return str(file_path).endswith(gzip_suffixes)
+
+
 def parse_line_ranges(lines_arg: str) -> set[int]:
     """
     Parse line ranges from string like "1,2,3,5-10" into a set of line numbers.
@@ -106,6 +113,9 @@ def load_ndjson(
     """
     Load NDJSON file and return list of JSON objects.
 
+    Supports uncompressed (.ndjson), gzip compressed (.ndjson.gz),
+    and gzip member concatenation (.bin.ndjson) formats.
+
     Args:
         file_path: Path to the NDJSON file
         not_save_irs: Whether to NOT save file_content and python_source for compilation events
@@ -122,8 +132,13 @@ def load_ndjson(
     filtered_compilation_events = 0
     total_lines_processed = 0
 
+    # Determine if file is gzip compressed
+    is_compressed = _is_gzip_file(file_path)
+    opener = gzip.open if is_compressed else open
+    mode = "rt" if is_compressed else "r"
+
     try:
-        with open(file_path, "r", encoding="utf-8") as f:
+        with opener(file_path, mode, encoding="utf-8") as f:
             # enumerate(f, 1) starts line numbering from 1 (1-based indexing)
             for line_num, line in enumerate(f, 1):
                 line = line.strip()