From f9499892181d068bee6700583920278f8f66187a Mon Sep 17 00:00:00 2001
From: Li Yin
Date: Tue, 2 Jul 2024 20:47:16 -0700
Subject: [PATCH] complete first version parser
---
developer_notes/parser_note.py | 46 +++++
.../source/developer_notes/output_parsers.rst | 160 +++++++++++++++++-
.../components/output_parsers/outputs.py | 28 +--
3 files changed, 223 insertions(+), 11 deletions(-)
diff --git a/developer_notes/parser_note.py b/developer_notes/parser_note.py
index 56b8d3bb6..6e3a2a7f2 100644
--- a/developer_notes/parser_note.py
+++ b/developer_notes/parser_note.py
@@ -227,6 +227,50 @@ def yaml_parser():
print(parser(yaml_list_str))
+def json_output_parser():  # demo: JsonOutputParser with a small ``User`` DataClass
+ from dataclasses import dataclass, field
+ from lightrag.components.output_parsers import JsonOutputParser
+ from lightrag.core import DataClass
+
+ @dataclass
+ class User(DataClass):
+ id: int = field(default=1, metadata={"description": "User ID"})
+ name: str = field(default="John", metadata={"description": "User name"})
+
+ user_example = User(id=1, name="John")  # example instance handed to the parser via ``examples``
+
+ user_to_parse = '{"id": 2, "name": "Jane"}'  # raw JSON string to be parsed
+
+ parser = JsonOutputParser(data_class=User, examples=[user_example])
+ print(parser)
+ output_format_str = parser.format_instructions()  # prompt text rendered from the schema and the example
+ print(output_format_str)
+ parsed_user = parser(user_to_parse)  # the accompanying docs show {'id': 2, 'name': 'Jane'}
+ print(parsed_user)
+
+
+def yaml_output_parser():  # demo: YamlOutputParser with a small ``User`` DataClass
+ from dataclasses import dataclass, field
+ from lightrag.components.output_parsers import YamlOutputParser
+ from lightrag.core import DataClass
+
+ @dataclass
+ class User(DataClass):
+ id: int = field(default=1, metadata={"description": "User ID"})
+ name: str = field(default="John", metadata={"description": "User name"})
+
+ user_example = User(id=1, name="John")  # example instance handed to the parser via ``examples``
+
+ user_to_parse = "id: 2\nname: Jane"  # raw YAML string to be parsed
+
+ parser = YamlOutputParser(data_class=User, examples=[user_example])
+ print(parser)
+ output_format_str = parser.format_instructions()  # format-instruction text; see the docs in this patch for the rendered output
+ print(output_format_str)
+ parsed_user = parser(user_to_parse)  # the accompanying docs show {'id': 2, 'name': 'Jane'}
+ print(parsed_user)
+
+
if __name__ == "__main__":
examples_of_different_ways_to_parse_string()
int_parser()
@@ -235,3 +279,5 @@ def yaml_parser():
list_parser()
json_parser()
yaml_parser()
+ json_output_parser()
+ yaml_output_parser()
diff --git a/docs/source/developer_notes/output_parsers.rst b/docs/source/developer_notes/output_parsers.rst
index 72afcc1f2..25ddc738d 100644
--- a/docs/source/developer_notes/output_parsers.rst
+++ b/docs/source/developer_notes/output_parsers.rst
@@ -141,7 +141,7 @@ Thus, ``JsonOutputParser`` and ``YamlOutputParser`` both takes the following arg
- ``examples``: the examples of the data class instance if you want to show the examples in the prompt.
- ``exclude``: the fields to exclude from both the data format and the examples.
-.. TODO: a summary table
+.. TODO: a summary table and a diagram
Parser in Action
------------------
@@ -327,6 +327,164 @@ The output will be:
Output Parsers in Action
--------------------------
+
+We will create the following simple ``DataClass`` with one example.
+We will then demonstrate how to use ``JsonOutputParser`` and ``YamlOutputParser`` to parse another example into a dict object.
+
+.. code-block:: python
+
+ from dataclasses import dataclass, field
+ from lightrag.core import DataClass
+
+ @dataclass
+ class User(DataClass):
+ id: int = field(default=1, metadata={"description": "User ID"})
+ name: str = field(default="John", metadata={"description": "User name"})
+
+ user_example = User(id=1, name="John")
+
+**JsonOutputParser**
+
+Here is how to use ``JsonOutputParser``:
+
+.. code-block:: python
+
+ from lightrag.components.output_parsers import JsonOutputParser
+
+ parser = JsonOutputParser(data_class=User, examples=[user_example])
+ print(parser)
+
+The structure of it:
+
+.. code-block::
+
+ JsonOutputParser(
+ data_class=User, examples=[json_output_parser.<locals>.User(id=1, name='John')], exclude_fields=None
+ (json_output_format_prompt): Prompt(
+ template: Your output should be formatted as a standard JSON instance with the following schema:
+ ```
+ {{schema}}
+ ```
+ {% if example %}
+ Examples:
+ ```
+ {{example}}
+ ```
+ {% endif %}
+ -Make sure to always enclose the JSON output in triple backticks (```). Please do not add anything other than valid JSON output!
+ -Use double quotes for the keys and string values.
+ -Follow the JSON formatting conventions., prompt_variables: ['example', 'schema']
+ )
+ (output_processors): JsonParser()
+ )
+
+The output format string will be:
+
+.. code-block::
+
+ Your output should be formatted as a standard JSON instance with the following schema:
+ ```
+ {
+ "id": " (int) (optional)",
+ "name": " (str) (optional)"
+ }
+ ```
+ Examples:
+ ```
+ {
+ "id": 1,
+ "name": "John"
+ }
+ ________
+ ```
+ -Make sure to always enclose the JSON output in triple backticks (```). Please do not add anything other than valid JSON output!
+ -Use double quotes for the keys and string values.
+ -Follow the JSON formatting conventions.
+
+Call the parser with the following string:
+
+.. code-block:: python
+
+ user_to_parse = '{"id": 2, "name": "Jane"}'
+ parsed_user = parser(user_to_parse)
+ print(parsed_user)
+
+The output will be:
+
+.. code-block:: python
+
+ {'id': 2, 'name': 'Jane'}
+
+**YamlOutputParser**
+
+The steps are exactly the same as for ``JsonOutputParser``.
+
+.. code-block:: python
+
+ from lightrag.components.output_parsers import YamlOutputParser
+
+ parser = YamlOutputParser(data_class=User, examples=[user_example])
+ print(parser)
+
+The structure of it:
+
+.. code-block::
+
+ YamlOutputParser(
+ data_class=<class '__main__.yaml_output_parser.<locals>.User'>, examples=[yaml_output_parser.<locals>.User(id=1, name='John')]
+ (yaml_output_format_prompt): Prompt(
+ template: Your output should be formatted as a standard YAML instance with the following schema:
+ ```
+ {{schema}}
+ ```
+ {% if example %}
+ Examples:
+ ```
+ {{example}}
+ ```
+ {% endif %}
+
+ -Make sure to always enclose the YAML output in triple backticks (```). Please do not add anything other than valid YAML output!
+ -Follow the YAML formatting conventions with an indent of 2 spaces.
+ -Quote the string values properly., prompt_variables: ['schema', 'example']
+ )
+ (output_processors): YamlParser()
+ )
+
+The output format string will be:
+
+.. code-block::
+
+ Your output should be formatted as a standard YAML instance with the following schema:
+ ```
+ id: (int) (optional)
+ name: (str) (optional)
+ ```
+ Examples:
+ ```
+ id: 1
+ name: John
+
+ ________
+ ```
+
+ -Make sure to always enclose the YAML output in triple backticks (```). Please do not add anything other than valid YAML output!
+ -Follow the YAML formatting conventions with an indent of 2 spaces.
+ -Quote the string values properly.
+
+Now, let us parse the following string:
+
+.. code-block:: python
+
+ user_to_parse = "id: 2\nname: Jane"
+ parsed_user = parser(user_to_parse)
+ print(parsed_user)
+
+The output will be:
+
+.. code-block:: python
+
+ {'id': 2, 'name': 'Jane'}
.. # todo
.. Evaluate Format following
.. --------------------------
diff --git a/lightrag/lightrag/components/output_parsers/outputs.py b/lightrag/lightrag/components/output_parsers/outputs.py
index 337c85709..211835e92 100644
--- a/lightrag/lightrag/components/output_parsers/outputs.py
+++ b/lightrag/lightrag/components/output_parsers/outputs.py
@@ -51,10 +51,7 @@
-Quote the string values properly."""
LIST_OUTPUT_FORMAT = r"""Your output should be formatted as a standard Python list.
--Each element can be of any Python data type such as string, integer, float, list, dictionary, etc.
--You can also have nested lists and dictionaries.
--Please do not add anything other than valid Python list output!
-"""
+- Start the list with '[' and end with ']'"""
YAML_OUTPUT_PARSER_OUTPUT_TYPE = Dict[str, Any]
@@ -139,13 +136,18 @@ def __init__(
if not is_dataclass(data_class):
raise ValueError(f"Provided class is not a dataclass: {data_class}")
+ if not issubclass(data_class, DataClass):
+ raise ValueError(
+ f"Provided class is not a subclass of DataClass: {data_class}"
+ )
+
# ensure example is instance of data class and initiated
if examples is not None and not isinstance(examples[0], data_class):
raise ValueError(
f"Provided example is not an instance of the data class: {data_class}"
)
self._exclude_fields = exclude_fields
- self.data_class_for_yaml: DataClass = data_class
+ self.data_class: DataClass = data_class
self.yaml_output_format_prompt = Prompt(template=YAML_OUTPUT_FORMAT)
self.output_processors = YamlParser()
self.examples = examples
@@ -163,7 +165,7 @@ def format_instructions(
exclude (List[str], optional): The fields to exclude from the schema of the data class.
"""
format_type = format_type or DataClassFormatType.SIGNATURE_YAML
- schema = self.data_class_for_yaml.format_class_str(
+ schema = self.data_class.format_class_str(
format_type=format_type, exclude=self._exclude_fields
)
# convert example to string, convert data class to yaml string
@@ -189,7 +191,7 @@ def call(self, input: str) -> YAML_OUTPUT_PARSER_OUTPUT_TYPE:
return self.output_processors(input)
def _extra_repr(self) -> str:
- s = f"data_class_for_yaml={self.data_class_for_yaml}, examples={self.examples}"
+ s = f"data_class={self.data_class}, examples={self.examples}"
return s
@@ -204,13 +206,18 @@ def __init__(
if not is_dataclass(data_class):
raise ValueError(f"Provided class is not a dataclass: {data_class}")
+ if not issubclass(data_class, DataClass):
+ raise ValueError(
+ f"Provided class is not a subclass of DataClass: {data_class}"
+ )
+
if examples is not None and not isinstance(examples[0], data_class):
raise ValueError(
f"Provided example is not an instance of the data class: {data_class}"
)
self._exclude_fields = exclude_fields
template = JSON_OUTPUT_FORMAT
- self.data_class_for_json: DataClass = data_class
+ self.data_class: DataClass = data_class
self.json_output_format_prompt = Prompt(template=template)
self.output_processors = JsonParser()
self.examples = examples
@@ -228,7 +235,7 @@ def format_instructions(
Options: DataClassFormatType.SIGNATURE_YAML, DataClassFormatType.SIGNATURE_JSON, DataClassFormatType.SCHEMA.
"""
format_type = format_type or DataClassFormatType.SIGNATURE_JSON
- schema = self.data_class_for_json.format_class_str(
+ schema = self.data_class.format_class_str(
format_type=format_type, exclude=self._exclude_fields
)
example_str = ""
@@ -244,6 +251,7 @@ def format_instructions(
log.debug(f"{__class__.__name__} example_str: {example_str}")
except Exception:
+ log.error(f"Error in formatting example for {__class__.__name__}")
example_str = None
return self.json_output_format_prompt(schema=schema, example=example_str)
@@ -251,7 +259,7 @@ def call(self, input: str) -> Any:
return self.output_processors(input)
def _extra_repr(self) -> str:
- s = f"data_class_for_json={self.data_class_for_json}, examples={self.examples}, exclude_fields={self._exclude_fields}"
+ s = f"""data_class={self.data_class.__name__}, examples={self.examples}, exclude_fields={self._exclude_fields}"""
return s