🐛 Refactor data loaders to be lazy and use generators to prevent memory problems #103

Open · wants to merge 1 commit into base: main
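
For context, a minimal sketch of the eager-versus-lazy pattern this refactor moves toward. The function names below are illustrative only, not the actual pystreamapi API:

```python
from csv import reader
from typing import Iterator, List


def load_rows_eager(path: str) -> List[list]:
    """Eager: every row is parsed into a list before anything is returned."""
    with open(path, newline='', encoding='utf-8') as f:
        return list(reader(f))


def load_rows_lazy(path: str) -> Iterator[list]:
    """Lazy: rows are parsed and yielded one at a time, keeping memory flat."""
    with open(path, newline='', encoding='utf-8') as f:
        yield from reader(f)


# Only the current row is held in memory with the lazy variant:
# for row in load_rows_lazy("data.csv"):
#     handle(row)
```
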
65 changes: 43 additions & 22 deletions pystreamapi/loaders/__csv/__csv_loader.py
@@ -1,41 +1,62 @@
from collections import namedtuple
from csv import reader
from io import StringIO
from typing import Any, Iterator

from pystreamapi.loaders.__loader_utils import LoaderUtils
from pystreamapi.loaders.__lazy_file_iterable import LazyFileIterable


def csv(file_path: str, cast_types=True, delimiter=',', encoding="utf-8") -> LazyFileIterable:
def csv(
src: str, read_from_src=False, cast_types=True, delimiter=',', encoding="utf-8"
) -> Iterator[Any]:
"""
Loads a CSV file and converts it into a list of namedtuples.

Returns:
list: A list of namedtuples, where each namedtuple represents a row in the CSV.
:param cast_types: Set as False to disable casting of values to int, bool or float.
:param encoding: The encoding of the CSV file.
:param file_path: The path to the CSV file.
:param delimiter: The delimiter used in the CSV file.
Lazily loads CSV data from either a path or a string and yields namedtuples.

Args:
src (str): Either the path to a CSV file or a CSV string.
read_from_src (bool): If True, src is treated as a CSV string.
If False, src is treated as a path to a CSV file.
cast_types (bool): Set as False to disable casting of values to int, bool or float.
delimiter (str): The delimiter used in the CSV data.
encoding (str): The encoding of the CSV file (only used when reading from file).

Yields:
namedtuple: Each row in the CSV as a namedtuple.
"""
file_path = LoaderUtils.validate_path(file_path)
return LazyFileIterable(lambda: __load_csv(file_path, cast_types, delimiter, encoding))
if not read_from_src:
src = LoaderUtils.validate_path(src)
return __load_csv_from_file(src, cast_types, delimiter, encoding)
return __load_csv_from_string(src, cast_types, delimiter)


def __load_csv(file_path, cast, delimiter, encoding):
"""Load a CSV file and convert it into a list of namedtuples"""
def __load_csv_from_file(file_path, cast, delimiter, encoding):
"""Load a CSV file and convert it into a generator of namedtuples"""
# skipcq: PTC-W6004
with open(file_path, mode='r', newline='', encoding=encoding) as csvfile:
csvreader = reader(csvfile, delimiter=delimiter)
yield from __process_csv(csvfile, cast, delimiter)


def __load_csv_from_string(csv_string, cast, delimiter):
"""Load a CSV from string and convert it into a generator of namedtuples"""
with StringIO(csv_string) as csvfile:
yield from __process_csv(csvfile, cast, delimiter)


# Create a namedtuple type, casting the header values to int or float if possible
header = __get_csv_header(csvreader)
def __process_csv(csvfile, cast, delimiter):
"""Process CSV data and yield namedtuples"""
csvreader = reader(csvfile, delimiter=delimiter)

Row = namedtuple('Row', list(header))
# Create a namedtuple type, casting the header values to int or float if possible
header = __get_csv_header(csvreader)
if not header:
return

mapper = LoaderUtils.try_cast if cast else lambda x: x
Row = namedtuple('Row', list(header))
mapper = LoaderUtils.try_cast if cast else lambda x: x

# Process the data, casting values to int or float if possible
data = [Row(*[mapper(value) for value in row]) for row in csvreader]
return data
# Yield the data row by row, casting values to int or float if possible
for row in csvreader:
yield Row(*[mapper(value) for value in row])


def __get_csv_header(csvreader):
55 changes: 33 additions & 22 deletions pystreamapi/loaders/__json/__json_loader.py
@@ -1,40 +1,51 @@
import json as jsonlib
from collections import namedtuple
from typing import Any, Iterator

from pystreamapi.loaders.__lazy_file_iterable import LazyFileIterable
from pystreamapi.loaders.__loader_utils import LoaderUtils


def json(src: str, read_from_src=False) -> LazyFileIterable:
def json(src: str, read_from_src=False) -> Iterator[Any]:
"""
Loads JSON data from either a path or a string and converts it into a list of namedtuples.
Lazily loads JSON data from either a path or a string and yields namedtuples.

Returns:
list: A list of namedtuples, where each namedtuple represents an object in the JSON.
:param src: Either the path to a JSON file or a JSON string.
:param read_from_src: If True, src is treated as a JSON string. If False, src is treated as
a path to a JSON file.
Args:
src (str): Either the path to a JSON file or a JSON string.
read_from_src (bool): If True, src is treated as a JSON string.
If False, src is treated as a path to a JSON file.

Yields:
namedtuple: Each object in the JSON as a namedtuple.
"""
if read_from_src:
return LazyFileIterable(lambda: __load_json_string(src))
return __lazy_load_json_string(src)
path = LoaderUtils.validate_path(src)
return LazyFileIterable(lambda: __load_json_file(path))
return __lazy_load_json_file(path)


def __lazy_load_json_file(file_path: str) -> Iterator[Any]:
"""Lazily read and parse a JSON file, yielding namedtuples."""

def generator():
# skipcq: PTC-W6004
with open(file_path, mode='r', encoding='utf-8') as jsonfile:
src = jsonfile.read()
if src == '':
return
Comment on lines +33 to +34
suggestion (bug_risk): Whitespace-only JSON files not skipped

src == '' does not handle files with only whitespace. Use if not src.strip(): to skip such files and prevent JSON decode errors.

Suggested change
if src == '':
return
if not src.strip():
return

yield from jsonlib.loads(src, object_hook=__dict_to_namedtuple)

issue (bug_risk): Incorrect iteration over JSON file data for non-array top-level objects

Instead of using yield from directly, assign the loaded JSON to a variable and yield from it if it's a list, or yield it directly if not. This prevents iterating over the fields of a namedtuple when the top-level object is not a list.
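
A minimal sketch of the branching described above (illustrative helper names, not the exact patch; `_dict_to_namedtuple` stands in for the loader's private hook):

```python
import json as jsonlib
from collections import namedtuple


def _dict_to_namedtuple(d, name='Item'):
    # Stand-in for the loader's private __dict_to_namedtuple helper.
    return namedtuple(name, list(d.keys()))(*d.values())


def _yield_json(src: str):
    """Yield each element of a top-level array, or a single top-level object."""
    data = jsonlib.loads(src, object_hook=_dict_to_namedtuple)
    if isinstance(data, list):
        yield from data   # top-level array: one namedtuple per element
    else:
        yield data        # top-level object: yield it whole, not its fields
```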


return generator()


def __load_json_file(file_path):
"""Load a JSON file and convert it into a list of namedtuples"""
# skipcq: PTC-W6004
with open(file_path, mode='r', encoding='utf-8') as jsonfile:
src = jsonfile.read()
if src == '':
return []
data = jsonlib.loads(src, object_hook=__dict_to_namedtuple)
return data
def __lazy_load_json_string(json_string: str) -> Iterator[Any]:
"""Lazily parse a JSON string, yielding namedtuples."""

def generator():
if not json_string.strip():
return
yield from jsonlib.loads(json_string, object_hook=__dict_to_namedtuple)

issue (bug_risk): Incorrect iteration over JSON string data for non-array top-level objects

Handle lists and single objects separately instead of always using yield from, to avoid incorrect unpacking when the top-level object is not an array.
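
The same list-versus-object branching sketched for the file loader above would cover this case as well.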


def __load_json_string(json_string):
"""Load JSON data from a string and convert it into a list of namedtuples"""
return jsonlib.loads(json_string, object_hook=__dict_to_namedtuple)
return generator()


def __dict_to_namedtuple(d, name='Item'):
54 changes: 29 additions & 25 deletions pystreamapi/loaders/__xml/__xml_loader.py
@@ -1,11 +1,12 @@
from typing import Iterator, Any

try:
from defusedxml import ElementTree
except ImportError as exc:
raise ImportError(
"Please install the xml_loader extra dependency to use the xml loader."
) from exc
from collections import namedtuple
from pystreamapi.loaders.__lazy_file_iterable import LazyFileIterable
from pystreamapi.loaders.__loader_utils import LoaderUtils


@@ -21,14 +22,14 @@ def __init__(self):


def xml(src: str, read_from_src=False, retrieve_children=True, cast_types=True,
encoding="utf-8") -> LazyFileIterable:
encoding="utf-8") -> Iterator[Any]:
"""
Loads XML data from either a path or a string and converts it into a list of namedtuples.
Warning: This method isn't safe against malicious XML trees. Parse only safe XML from sources
you trust.

Returns:
LazyFileIterable: A list of namedtuples, where each namedtuple represents an XML element.
An iterator with namedtuples, where each namedtuple represents an XML element.
:param retrieve_children: If true, the children of the root element are used as stream
elements.
:param encoding: The encoding of the XML file.
@@ -39,32 +40,37 @@ def xml(src: str, read_from_src=False, retrieve_children=True, cast_types=True,
"""
config.cast_types = cast_types
config.retrieve_children = retrieve_children
Comment on lines 41 to 42
issue (bug_risk): Module-level config mutation causes global side-effects

Passing these options as function arguments or using a per-call config object would prevent unintended side effects from shared state.
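
One possible shape of the per-call configuration mentioned above; a hedged sketch that assumes the private parse helpers could take the config as a parameter, not the loader's current API:

```python
from dataclasses import dataclass
from typing import Any, Iterator


@dataclass(frozen=True)
class XmlLoaderConfig:
    """Options scoped to a single call instead of shared module state."""
    cast_types: bool = True
    retrieve_children: bool = True


def _lazy_parse_xml_string(xml_string: str, cfg: XmlLoaderConfig) -> Iterator[Any]:
    ...  # hypothetical: parse and yield namedtuples, reading options from cfg


def xml(src: str, retrieve_children=True, cast_types=True) -> Iterator[Any]:
    # Build the config per call and thread it through the helpers, so
    # concurrent or nested calls with different options cannot interfere.
    cfg = XmlLoaderConfig(cast_types=cast_types, retrieve_children=retrieve_children)
    return _lazy_parse_xml_string(src, cfg)
```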


if read_from_src:
return LazyFileIterable(lambda: __load_xml_string(src))
return _lazy_parse_xml_string(src)

path = LoaderUtils.validate_path(src)
return LazyFileIterable(lambda: __load_xml_file(path, encoding))
return _lazy_parse_xml_file(path, encoding)


def _lazy_parse_xml_file(file_path: str, encoding: str) -> Iterator[Any]:
def generator():
with open(file_path, mode='r', encoding=encoding) as xmlfile:
xml_string = xmlfile.read()
yield from _parse_xml_string_lazy(xml_string)
Comment on lines +51 to +55
suggestion (performance): Reads entire XML file into memory, may lead to high memory usage

Consider using a streaming parser like ElementTree.iterparse to handle large files more efficiently.

Suggested change
def _lazy_parse_xml_file(file_path: str, encoding: str) -> Iterator[Any]:
def generator():
with open(file_path, mode='r', encoding=encoding) as xmlfile:
xml_string = xmlfile.read()
yield from _parse_xml_string_lazy(xml_string)
def _lazy_parse_xml_file(file_path: str, encoding: str) -> Iterator[Any]:
# Use ElementTree.iterparse for efficient streaming parsing
# Note: ElementTree does not support encoding argument in iterparse, so we open the file accordingly
with open(file_path, mode='r', encoding=encoding) as xmlfile:
# iterparse expects a file-like object opened in text mode for XML
context = ElementTree.iterparse(xmlfile, events=("end",))
for event, elem in context:
yield elem
# Optionally clear the element to free memory
elem.clear()


return generator()

def __load_xml_file(file_path, encoding):
"""Load an XML file and convert it into a list of namedtuples."""
# skipcq: PTC-W6004
with open(file_path, mode='r', encoding=encoding) as xmlfile:
src = xmlfile.read()
if src:
return __parse_xml_string(src)
return []

def _lazy_parse_xml_string(xml_string: str) -> Iterator[Any]:
def generator():
yield from _parse_xml_string_lazy(xml_string)

def __load_xml_string(xml_string):
"""Load XML data from a string and convert it into a list of namedtuples."""
return __parse_xml_string(xml_string)
return generator()


def __parse_xml_string(xml_string):
"""Parse XML string and convert it into a list of namedtuples."""
def _parse_xml_string_lazy(xml_string: str) -> Iterator[Any]:
root = ElementTree.fromstring(xml_string)
parsed_xml = __parse_xml(root)
return __flatten(parsed_xml) if config.retrieve_children else [parsed_xml]
parsed = __parse_xml(root)
if config.retrieve_children:
yield from __flatten(parsed)
else:
yield parsed


def __parse_xml(element):
@@ -107,11 +113,9 @@ def __filter_single_items(tag_dict):


def __flatten(data):
"""Flatten a list of lists."""
res = []
"""Yield flattened elements from a possibly nested structure."""
for item in data:
if isinstance(item, list):
res.extend(item)
yield from item
else:
res.append(item)
return res
yield item
25 changes: 13 additions & 12 deletions pystreamapi/loaders/__yaml/__yaml_loader.py
@@ -1,3 +1,5 @@
from typing import Any, Iterator

try:
import yaml as yaml_lib
except ImportError as exc:
@@ -6,11 +8,10 @@
) from exc
from collections import namedtuple

from pystreamapi.loaders.__lazy_file_iterable import LazyFileIterable
from pystreamapi.loaders.__loader_utils import LoaderUtils


def yaml(src: str, read_from_src=False) -> LazyFileIterable:
def yaml(src: str, read_from_src=False) -> Iterator[Any]:
"""
Loads YAML data from either a path or a string and converts it into a list of namedtuples.

@@ -23,26 +24,26 @@ def yaml(src: str, read_from_src=False) -> LazyFileIterable:
list: A list of namedtuples, where each namedtuple represents an object in the YAML.
"""
if read_from_src:
return LazyFileIterable(lambda: __load_yaml_string(src))
return __load_yaml_string(src)
path = LoaderUtils.validate_path(src)
return LazyFileIterable(lambda: __load_yaml_file(path))
return __load_yaml_file(path)


def __load_yaml_file(file_path):
"""Load a YAML file and convert it into a list of namedtuples"""
# skipcq: PTC-W6004
with open(file_path, mode='r', encoding='utf-8') as yamlfile:
src = yamlfile.read()
if src == '':
return []
data = yaml_lib.safe_load(src)
return __convert_to_namedtuples(data)
with open(file_path, 'r', encoding='utf-8') as yamlfile:
# Supports both single and multiple documents
for document in yaml_lib.safe_load_all(yamlfile):
if document:
yield from __convert_to_namedtuples(document)


def __load_yaml_string(yaml_string):
"""Load YAML data from a string and convert it into a list of namedtuples"""
data = yaml_lib.safe_load(yaml_string)
return [] if data is None else __convert_to_namedtuples(data)
for document in yaml_lib.safe_load_all(yaml_string):
if document:
yield from __convert_to_namedtuples(document)


def __convert_to_namedtuples(data, name='Item'):