Merge pull request SylphAI-Inc#75 from SylphAI-Inc/li

[documentation]
CloudBee7 · Jul 4, 2024 · 02898b7 · 02898b7
2 parents a3064fb + 7429f3f
commit 02898b7
Show file tree

Hide file tree

Showing 81 changed files with 439 additions and 728 deletions.
diff --git a/README.md b/README.md
@@ -1,11 +1,11 @@
 ![LightRAG Logo](https://raw.githubusercontent.com/SylphAI-Inc/LightRAG/main/docs/source/_static/images/LightRAG-logo-doc.jpeg)
 
-## ⚡⚡⚡ The PyTorch Library for Large language Model (LLM) Applications ⚡⚡⚡
+### ⚡⚡⚡ The PyTorch Library for Large Language Model (LLM) Applications ⚡⚡⚡
 
 *LightRAG* helps developers with both building and optimizing *Retriever-Agent-Generator (RAG)* pipelines.
 It is *light*, *modular*, and *robust*.
 
-
+<!--
 
 **PyTorch**
 
@@ -30,37 +30,130 @@ class Net(nn.Module):
       x = self.dropout2(x)
       x = self.fc1(x)
       return self.fc2(x)
-```
+``` -->
+## LightRAG Task Pipeline
+
 
-**LightRAG**
+We will ask the model to respond with an ``explaination`` and an ``example`` of a concept, and we build the pipeline to return the structured output as ``QAOutput``.
 
 ```python
 
-from lightrag.core import Component, Generator
-from lightrag.components.model_client import GroqAPIClient
+from dataclasses import dataclass, field
 
-class SimpleQA(Component):
-   def __init__(self):
-      super().__init__()
-      template = r"""<SYS>
-      You are a helpful assistant.
-      </SYS>
-      User: {{input_str}}
-      You:
-      """
-      self.generator = Generator(
+from lightrag.core import Component, Generator, fun_to_component
+from lightrag.components.model_client import GroqAPIClient
+from lightrag.core import DataClass, fun_to_component, Sequential
+from lightrag.components.output_parsers import JsonOutputParser
+
+@dataclass
+class QAOutput(DataClass):
+    explaination: str = field(
+        metadata={"desc": "A brief explaination of the concept in one sentence."}
+    )
+    example: str = field(metadata={"desc": "An example of the concept in a sentence."})
+
+
+@fun_to_component
+def to_qa_output(data: dict) -> QAOutput:
+    return QAOutput.from_dict(data)
+
+
+class QA(Component):
+    def __init__(self):
+        super().__init__()
+        template = r"""<SYS>
+        You are a helpful assistant.
+        <OUTPUT_FORMAT>
+        {{output_format_str}}
+        </OUTPUT_FORMAT>
+        </SYS>
+        User: {{input_str}}
+        You:
+        """
+        parser = JsonOutputParser(data_class=QAOutput)
+        self.generator = Generator(
             model_client=GroqAPIClient(),
             model_kwargs={"model": "llama3-8b-8192"},
             template=template,
+            prompt_kwargs={"output_format_str": parser.format_instructions()},
+            output_processors=Sequential(parser, to_qa_output),
+        )
+
+    def call(self, query: str):
+        return self.generator.call({"input_str": query})
+
+    async def acall(self, query: str):
+        return await self.generator.acall({"input_str": query})
+```
+
+
+Run the following code to visualize the pipeline and call the model.
+
+```python
+
+qa = QA()
+print(qa)
+
+# call
+output = qa("What is LLM?")
+print(output)
+```
+
+**Structure of the pipeline**
+
+Here is what we get from ``print(qa)``:
+
+```
+QA(
+  (generator): Generator(
+    model_kwargs={'model': 'llama3-8b-8192'},
+    (prompt): Prompt(
+      template: <SYS>
+              You are a helpful assistant.
+              <OUTPUT_FORMAT>
+              {{output_format_str}}
+              </OUTPUT_FORMAT>
+              </SYS>
+              User: {{input_str}}
+              You:
+              , prompt_kwargs: {'output_format_str': 'Your output should be formatted as a standard JSON instance with the following schema:\n```\n{\n    "explaination": "A brief explaination of the concept in one sentence. (str) (required)",\n    "example": "An example of the concept in a sentence. (str) (required)"\n}\n```\n-Make sure to always enclose the JSON output in triple backticks (```). Please do not add anything other than valid JSON output!\n-Use double quotes for the keys and string values.\n-Follow the JSON formatting conventions.'}, prompt_variables: ['output_format_str', 'input_str']
+    )
+    (model_client): GroqAPIClient()
+    (output_processors): Sequential(
+      (0): JsonOutputParser(
+        data_class=QAOutput, examples=None, exclude_fields=None
+        (json_output_format_prompt): Prompt(
+          template: Your output should be formatted as a standard JSON instance with the following schema:
+          ```
+          {{schema}}
+          ```
+          {% if example %}
+          Examples:
+          ```
+          {{example}}
+          ```
+          {% endif %}
+          -Make sure to always enclose the JSON output in triple backticks (```). Please do not add anything other than valid JSON output!
+          -Use double quotes for the keys and string values.
+          -Follow the JSON formatting conventions., prompt_variables: ['schema', 'example']
+        )
+        (output_processors): JsonParser()
       )
+      (1): ToQaOutputComponent(fun_name=to_qa_output)
+    )
+  )
+)
+```
 
-   def call(self, query):
-      return self.generator({"input_str": query})
+**The output**
 
-   async def acall(self, query):
-      return await self.generator.acall({"input_str": query})
+Here is what we get from ``print(output)``:
+
+```
+GeneratorOutput(data=QAOutput(explaination='LLM stands for Large Language Model, which refers to a type of artificial intelligence designed to process and generate human-like language.', example='For example, a LLM can be trained to generate news articles, conversations, or even entire books, and can be used for a variety of applications such as language translation, text summarization, and chatbots.'), error=None, usage=None, raw_response='```\n{\n  "explaination": "LLM stands for Large Language Model, which refers to a type of artificial intelligence designed to process and generate human-like language.",\n  "example": "For example, a LLM can be trained to generate news articles, conversations, or even entire books, and can be used for a variety of applications such as language translation, text summarization, and chatbots."\n}', metadata=None)
 ```
 
+
 ## Quick Install
 
 Install LightRAG with pip:
@@ -81,7 +174,7 @@ LightRAG full documentation available at [lightrag.sylph.ai](https://lightrag.sy
 - [Introduction](https://lightrag.sylph.ai/)
 - [Full installation guide](https://lightrag.sylph.ai/get_started/installation.html)
 - [Design philosophy](https://lightrag.sylph.ai/developer_notes/lightrag_design_philosophy.html): Design based on three principles: Simplicity over complexity, Quality over quantity, and Optimizing over building.
-- [Class hierarchy](https://lightrag.sylph.ai/developer_notes/class_hierarchy.html): We have no more than two levels of subclasses. The bare minimum abstraction will developers with maximum customizability and simplicity.
+- [Class hierarchy](https://lightrag.sylph.ai/developer_notes/class_hierarchy.html): We have no more than two levels of subclasses. The bare minimum abstraction provides developers with maximum customizability and simplicity.
 - [Tutorials](https://lightrag.sylph.ai/developer_notes/index.html): Learn the `why` and `how-to` (customize and integrate) behind each core part within the `LightRAG` library.
 - [API reference](https://lightrag.sylph.ai/apis/index.html)
 
@@ -95,11 +188,12 @@ LightRAG full documentation available at [lightrag.sylph.ai](https://lightrag.sy
 # Citation
 
 ```bibtex
-@software{Yin-LightRAG-2024,
-  author = {Yin, Li},
-  title = {{LightRAG: The PyTorch Library for Large language Model (LLM) Applications}},
+@software{Yin2024LightRAG,
+  author = {Li Yin},
+  title = {{LightRAG: The PyTorch Library for Large Language Model (LLM) Applications}},
   month = {7},
   year = {2024},
+  doi = {10.5281/zenodo.12639531},
   url = {https://github.com/SylphAI-Inc/LightRAG}
 }
 ```
diff --git a/benchmarks/ReAct_agent/fever/fever.py b/benchmarks/ReAct_agent/fever/fever.py
@@ -7,13 +7,13 @@
 """
 
 import dotenv
-from components.api_client.openai_client import OpenAIClient
-from components.agent.react_agent import ReActAgent
-from core.func_tool import FunctionTool
-from components.api_client import GroqAPIClient
+from lightrag.components.model_client.openai_client import OpenAIClient
+from lightrag.components.agent.react import ReActAgent
+from lightrag.core.func_tool import FunctionTool
+from lightrag.components.model_client import GroqAPIClient
 import time
 from benchmarks.ReAct_agent.utils.tools import search, lookup, normalize_answer
-from eval.evaluators import AnswerMacthEvaluator
+from lightrag.eval.answer_match_acc import AnswerMatchAcc
 import logging
 import json
 from typing import List, Optional, Any, Dict
@@ -229,8 +229,8 @@ def experiment(
 
 
 # setup evaluators
-EM_evaluator = AnswerMacthEvaluator(type="exact_match")
-FM_evaluator = AnswerMacthEvaluator(type="fuzzy_match")
+EM_evaluator = AnswerMatchAcc(type="exact_match")
+FM_evaluator = AnswerMatchAcc(type="fuzzy_match")
 
 # load test data
 file = open("./tests/benchmark/ReAct_agent/paper_data/paper_dev_10.json")

diff --git a/class_hierarchy_edges.csv b/class_hierarchy_edges.csv
@@ -1,3 +1,6 @@
+Component,BooleanParser
+Component,IntParser
+Component,FloatParser
 Component,ListParser
 Component,JsonParser
 Component,YamlParser
@@ -15,7 +18,6 @@ Component,FunComponent
 Component,ReActAgent
 Component,OutputParser
 Component,TextSplitter
-Component,DocumentSplitter
 Component,ToEmbeddings
 Component,RetrieverOutputToContextStr
 Component,DefaultLLMJudge
@@ -56,8 +58,6 @@ DataClass,DialogTurn
 DataClass,Instruction
 DataClass,GeneratorStatesRecord
 DataClass,GeneratorCallRecord
-Generator,CoTGenerator
-Generator,CoTGeneratorWithJsonOutput
 OutputParser,YamlOutputParser
 OutputParser,JsonOutputParser
 OutputParser,ListOutputParser

diff --git a/developer_notes/__init__.py b/developer_notes/__init__.py
@@ -1,3 +1,4 @@
 from lightrag.utils import setup_env
 
+print("initiating setup_env()...")
 setup_env()
diff --git a/developer_notes/generator_note.py b/developer_notes/generator_note.py
@@ -1,5 +1,12 @@
+from dataclasses import dataclass, field
+
 from lightrag.core import Component, Generator
 from lightrag.components.model_client import GroqAPIClient
+from lightrag.core import DataClass, fun_to_component, Sequential
+from lightrag.components.output_parsers import JsonOutputParser
+from lightrag.utils import setup_env
+
+setup_env()
 
 
 class SimpleQA(Component):
@@ -24,6 +31,47 @@ async def acall(self, query):
         return await self.generator.acall({"input_str": query})
 
 
+@dataclass
+class QAOutput(DataClass):
+    explaination: str = field(
+        metadata={"desc": "A brief explaination of the concept in one sentence."}
+    )
+    example: str = field(metadata={"desc": "An example of the concept in a sentence."})
+
+
+@fun_to_component
+def to_qa_output(data: dict) -> QAOutput:
+    return QAOutput.from_dict(data)
+
+
+class QA(Component):
+    def __init__(self):
+        super().__init__()
+        template = r"""<SYS>
+        You are a helpful assistant.
+        <OUTPUT_FORMAT>
+        {{output_format_str}}
+        </OUTPUT_FORMAT>
+        </SYS>
+        User: {{input_str}}
+        You:
+        """
+        parser = JsonOutputParser(data_class=QAOutput)
+        self.generator = Generator(
+            model_client=GroqAPIClient(),
+            model_kwargs={"model": "llama3-8b-8192"},
+            template=template,
+            prompt_kwargs={"output_format_str": parser.format_instructions()},
+            output_processors=Sequential(parser, to_qa_output),
+        )
+
+    def call(self, query: str):
+        return self.generator.call({"input_str": query})
+
+    async def acall(self, query: str):
+        return await self.generator.acall({"input_str": query})
+
+
 def minimum_generator():
     from lightrag.core import Generator
     from lightrag.components.model_client import GroqAPIClient
@@ -169,13 +217,18 @@ def create_purely_from_config_2():
 
 
 if __name__ == "__main__":
-    qa = SimpleQA()
-    answer = qa("What is LightRAG?")
-    print(qa)
-
-    minimum_generator()
-    use_a_json_parser()
-    use_its_own_template()
-    use_model_client_enum_to_switch_client()
-    create_purely_from_config()
-    create_purely_from_config_2()
+    qa1 = SimpleQA()
+    answer = qa1("What is LightRAG?")
+    print(qa1)
+
+    qa2 = QA()
+    answer = qa2("What is LLM?")
+    print(qa2)
+    print(answer)
+
+    # minimum_generator()
+    # use_a_json_parser()
+    # use_its_own_template()
+    # use_model_client_enum_to_switch_client()
+    # create_purely_from_config()
+    # create_purely_from_config_2()
diff --git a/docs/Makefile b/docs/Makefile
@@ -1,12 +1,10 @@
 # Minimal makefile for Sphinx documentation
-#
-
 
 # You can set these variables from the command line.
 SPHINXOPTS    =
 SPHINXBUILD   = sphinx-build
 SPHINXPROJ    = LightRAG
-SOURCEDIR     = source# the source of output and conf.py
+SOURCEDIR     = source
 BUILDDIR      = build
 APIDOCOUTDIR  = $(SOURCEDIR)/apis
 
@@ -28,6 +26,12 @@ apidoc:
 	@sphinx-apidoc -o $(APIDOCOUTDIR)/optim ../lightrag/lightrag/optim --separate --force
 	@sphinx-apidoc -o $(APIDOCOUTDIR)/utils ../lightrag/lightrag/utils --separate --force
 	@sphinx-apidoc -o $(APIDOCOUTDIR)/tracing ../lightrag/lightrag/tracing --separate --force
+
+generate_autosummary:
+	@echo "Generating autosummary files"
+	@sphinx-autogen $(SOURCEDIR)/**/*.rst
+
+update_files:
 	@echo "Inserting reference labels into RST files."
 	@python $(SOURCEDIR)/insert_labels.py
 	@echo "Removing unnecessary strings for better formatting"
@@ -36,9 +40,6 @@ apidoc:
 	@python $(SOURCEDIR)/remove_files.py
 	@echo "Renaming and updating file"
 	@python $(SOURCEDIR)/change_api_file_name.py
-	# @echo "Renaming and updating file"
-	# @python $(SOURCEDIR)/change_api_file_name_autosummary.py
-
 
-html: apidoc
+html: apidoc generate_autosummary update_files
 	@$(SPHINXBUILD) -b html "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)