From f9499892181d068bee6700583920278f8f66187a Mon Sep 17 00:00:00 2001 From: Li Yin Date: Tue, 2 Jul 2024 20:47:16 -0700 Subject: [PATCH] complete first version parser --- developer_notes/parser_note.py | 46 +++++ .../source/developer_notes/output_parsers.rst | 160 +++++++++++++++++- .../components/output_parsers/outputs.py | 28 +-- 3 files changed, 223 insertions(+), 11 deletions(-) diff --git a/developer_notes/parser_note.py b/developer_notes/parser_note.py index 56b8d3bb6..6e3a2a7f2 100644 --- a/developer_notes/parser_note.py +++ b/developer_notes/parser_note.py @@ -227,6 +227,50 @@ def yaml_parser(): print(parser(yaml_list_str)) +def json_output_parser(): + from dataclasses import dataclass, field + from lightrag.components.output_parsers import JsonOutputParser + from lightrag.core import DataClass + + @dataclass + class User(DataClass): + id: int = field(default=1, metadata={"description": "User ID"}) + name: str = field(default="John", metadata={"description": "User name"}) + + user_example = User(id=1, name="John") + + user_to_parse = '{"id": 2, "name": "Jane"}' + + parser = JsonOutputParser(data_class=User, examples=[user_example]) + print(parser) + output_format_str = parser.format_instructions() + print(output_format_str) + parsed_user = parser(user_to_parse) + print(parsed_user) + + +def yaml_output_parser(): + from dataclasses import dataclass, field + from lightrag.components.output_parsers import YamlOutputParser + from lightrag.core import DataClass + + @dataclass + class User(DataClass): + id: int = field(default=1, metadata={"description": "User ID"}) + name: str = field(default="John", metadata={"description": "User name"}) + + user_example = User(id=1, name="John") + + user_to_parse = "id: 2\nname: Jane" + + parser = YamlOutputParser(data_class=User, examples=[user_example]) + print(parser) + output_format_str = parser.format_instructions() + print(output_format_str) + parsed_user = parser(user_to_parse) + print(parsed_user) + + if __name__ == 
"__main__": examples_of_different_ways_to_parse_string() int_parser() @@ -235,3 +279,5 @@ def yaml_parser(): list_parser() json_parser() yaml_parser() + json_output_parser() + yaml_output_parser() diff --git a/docs/source/developer_notes/output_parsers.rst b/docs/source/developer_notes/output_parsers.rst index 72afcc1f2..25ddc738d 100644 --- a/docs/source/developer_notes/output_parsers.rst +++ b/docs/source/developer_notes/output_parsers.rst @@ -141,7 +141,7 @@ Thus, ``JsonOutputParser`` and ``YamlOutputParser`` both takes the following arg - ``examples``: the examples of the data class instance if you want to show the examples in the prompt. - ``exclude``: the fields to exclude from both the data format and the examples. -.. TODO: a summary table +.. TODO: a summary table and a diagram Parser in Action ------------------ @@ -327,6 +327,164 @@ The output will be: Output Parsers in Action -------------------------- + +We will create the following simple ``DataClass`` with one example. +And we will demonstrate how to use ``JsonOutputParser`` and ``YamlOutputParser`` to parse another example to dict object. + +.. code-block:: python + + from dataclasses import dataclass, field + from lightrag.core import DataClass + + @dataclass + class User(DataClass): + id: int = field(default=1, metadata={"description": "User ID"}) + name: str = field(default="John", metadata={"description": "User name"}) + + user_example = User(id=1, name="John") + +**JsonOutputParser** + +Here is how to use ``JsonOutputParser``: + +.. code-block:: python + + from lightrag.components.output_parsers import JsonOutputParser + + parser = JsonOutputParser(data_class=User, examples=[user_example]) + print(parser) + +The structure of it: + +.. 
code-block:: + + JsonOutputParser( + data_class=User, examples=[json_output_parser.<locals>.User(id=1, name='John')], exclude_fields=None + (json_output_format_prompt): Prompt( + template: Your output should be formatted as a standard JSON instance with the following schema: + ``` + {{schema}} + ``` + {% if example %} + Examples: + ``` + {{example}} + ``` + {% endif %} + -Make sure to always enclose the JSON output in triple backticks (```). Please do not add anything other than valid JSON output! + -Use double quotes for the keys and string values. + -Follow the JSON formatting conventions., prompt_variables: ['example', 'schema'] + ) + (output_processors): JsonParser() + ) + +The output format string will be: + +.. code-block:: + + Your output should be formatted as a standard JSON instance with the following schema: + ``` + { + "id": " (int) (optional)", + "name": " (str) (optional)" + } + ``` + Examples: + ``` + { + "id": 1, + "name": "John" + } + ________ + ``` + -Make sure to always enclose the JSON output in triple backticks (```). Please do not add anything other than valid JSON output! + -Use double quotes for the keys and string values. + -Follow the JSON formatting conventions. + +Call the parser with the following string: + +.. code-block:: python + + user_to_parse = '{"id": 2, "name": "Jane"}' + parsed_user = parser(user_to_parse) + print(parsed_user) + +The output will be: + +.. code-block:: python + + {'id': 2, 'name': 'Jane'} + +**YamlOutputParser** + +The steps are exactly the same as for ``JsonOutputParser``. + +.. code-block:: python + + from lightrag.components.output_parsers import YamlOutputParser + + parser = YamlOutputParser(data_class=User, examples=[user_example]) + print(parser) + +The structure of it: + +..
code-block:: + + YamlOutputParser( + data_class=<class '__main__.yaml_output_parser.<locals>.User'>, examples=[yaml_output_parser.<locals>.User(id=1, name='John')] + (yaml_output_format_prompt): Prompt( + template: Your output should be formatted as a standard YAML instance with the following schema: + ``` + {{schema}} + ``` + {% if example %} + Examples: + ``` + {{example}} + ``` + {% endif %} + + -Make sure to always enclose the YAML output in triple backticks (```). Please do not add anything other than valid YAML output! + -Follow the YAML formatting conventions with an indent of 2 spaces. + -Quote the string values properly., prompt_variables: ['schema', 'example'] + ) + (output_processors): YamlParser() + ) + +The output format string will be: + +.. code-block:: + + Your output should be formatted as a standard YAML instance with the following schema: + ``` + id: (int) (optional) + name: (str) (optional) + ``` + Examples: + ``` + id: 1 + name: John + + ________ + ``` + + -Make sure to always enclose the YAML output in triple backticks (```). Please do not add anything other than valid YAML output! + -Follow the YAML formatting conventions with an indent of 2 spaces. + -Quote the string values properly. + +Now, let us parse the following string: + +.. code-block:: python + + user_to_parse = "id: 2\nname: Jane" + parsed_user = parser(user_to_parse) + print(parsed_user) + +The output will be: + +.. code-block:: python + + {'id': 2, 'name': 'Jane'} .. # todo .. Evaluate Format following .. -------------------------- diff --git a/lightrag/lightrag/components/output_parsers/outputs.py b/lightrag/lightrag/components/output_parsers/outputs.py index 337c85709..211835e92 100644 --- a/lightrag/lightrag/components/output_parsers/outputs.py +++ b/lightrag/lightrag/components/output_parsers/outputs.py @@ -51,10 +51,7 @@ -Quote the string values properly.""" LIST_OUTPUT_FORMAT = r"""Your output should be formatted as a standard Python list.
--Each element can be of any Python data type such as string, integer, float, list, dictionary, etc. --You can also have nested lists and dictionaries. --Please do not add anything other than valid Python list output! -""" +- Start the list with '[' and end with ']'""" YAML_OUTPUT_PARSER_OUTPUT_TYPE = Dict[str, Any] @@ -139,13 +136,18 @@ def __init__( if not is_dataclass(data_class): raise ValueError(f"Provided class is not a dataclass: {data_class}") + if not issubclass(data_class, DataClass): + raise ValueError( + f"Provided class is not a subclass of DataClass: {data_class}" + ) + # ensure example is instance of data class and initiated if examples is not None and not isinstance(examples[0], data_class): raise ValueError( f"Provided example is not an instance of the data class: {data_class}" ) self._exclude_fields = exclude_fields - self.data_class_for_yaml: DataClass = data_class + self.data_class: DataClass = data_class self.yaml_output_format_prompt = Prompt(template=YAML_OUTPUT_FORMAT) self.output_processors = YamlParser() self.examples = examples @@ -163,7 +165,7 @@ def format_instructions( exclude (List[str], optional): The fields to exclude from the schema of the data class. 
""" format_type = format_type or DataClassFormatType.SIGNATURE_YAML - schema = self.data_class_for_yaml.format_class_str( + schema = self.data_class.format_class_str( format_type=format_type, exclude=self._exclude_fields ) # convert example to string, convert data class to yaml string @@ -189,7 +191,7 @@ def call(self, input: str) -> YAML_OUTPUT_PARSER_OUTPUT_TYPE: return self.output_processors(input) def _extra_repr(self) -> str: - s = f"data_class_for_yaml={self.data_class_for_yaml}, examples={self.examples}" + s = f"data_class={self.data_class}, examples={self.examples}" return s @@ -204,13 +206,18 @@ def __init__( if not is_dataclass(data_class): raise ValueError(f"Provided class is not a dataclass: {data_class}") + if not issubclass(data_class, DataClass): + raise ValueError( + f"Provided class is not a subclass of DataClass: {data_class}" + ) + if examples is not None and not isinstance(examples[0], data_class): raise ValueError( f"Provided example is not an instance of the data class: {data_class}" ) self._exclude_fields = exclude_fields template = JSON_OUTPUT_FORMAT - self.data_class_for_json: DataClass = data_class + self.data_class: DataClass = data_class self.json_output_format_prompt = Prompt(template=template) self.output_processors = JsonParser() self.examples = examples @@ -228,7 +235,7 @@ def format_instructions( Options: DataClassFormatType.SIGNATURE_YAML, DataClassFormatType.SIGNATURE_JSON, DataClassFormatType.SCHEMA. 
""" format_type = format_type or DataClassFormatType.SIGNATURE_JSON - schema = self.data_class_for_json.format_class_str( + schema = self.data_class.format_class_str( format_type=format_type, exclude=self._exclude_fields ) example_str = "" @@ -244,6 +251,7 @@ def format_instructions( log.debug(f"{__class__.__name__} example_str: {example_str}") except Exception: + log.error(f"Error in formatting example for {__class__.__name__}") example_str = None return self.json_output_format_prompt(schema=schema, example=example_str) @@ -251,7 +259,7 @@ def call(self, input: str) -> Any: return self.output_processors(input) def _extra_repr(self) -> str: - s = f"data_class_for_json={self.data_class_for_json}, examples={self.examples}, exclude_fields={self._exclude_fields}" + s = f"""data_class={self.data_class.__name__}, examples={self.examples}, exclude_fields={self._exclude_fields}""" return s