Add and use ElasticsearchChatMessageHistory

demjened · demjened · commit 8d792d8c2fb3 · 2023-08-14T12:12:09.000-04:00
diff --git a/notebooks/generative-ai/chatbot.ipynb b/notebooks/generative-ai/chatbot.ipynb
@@ -295,7 +295,7 @@
    "source": [
     "## Chat with the chatbot 💬\n",
     "\n",
-    "Let's initialize our chatbot. We'll define Elasticsearch as a store for retrieving documents, OpenAI as the LLM to interpret questions and summarize answers, then we'll pass these to the conversational chain."
+    "Let's initialize our chatbot. We'll define Elasticsearch as a store for retrieving documents and for storing the chat session history, OpenAI as the LLM to interpret questions and summarize answers, then we'll pass these to the conversational chain."
    ]
   },
   {
@@ -307,6 +307,8 @@
     "from langchain.vectorstores.elastic_vector_search import ElasticKnnSearch\n",
     "from langchain.llms import OpenAI\n",
     "from langchain.chains import ConversationalRetrievalChain\n",
+    "from lib.elasticsearch_chat_message_history import ElasticsearchChatMessageHistory\n",
+    "from uuid import uuid4\n",
     "\n",
     "store = ElasticKnnSearch(\n",
     "    es_connection=elasticsearch_client,\n",
@@ -322,6 +324,13 @@
     "    llm=llm,\n",
     "    retriever=retriever,\n",
     "    return_source_documents=True\n",
+    ")\n",
+    "\n",
+    "session_id = str(uuid4())\n",
+    "chat_history = ElasticsearchChatMessageHistory(\n",
+    "    client=elasticsearch_client,\n",
+    "    session_id=session_id,\n",
+    "    index='workplace-docs-chat-history'\n",
     ")"
    ]
   },
@@ -343,33 +352,34 @@
      "name": "stdout",
      "output_type": "stream",
      "text": [
-      "QUESTION:  What does NASA stand for? \n",
-      "ANSWER:    NASA stands for North America South America. \n",
-      "SUPPORTING DOCUMENTS:  ['Sales Organization Overview', 'Code Of Conduct', 'Code Of Conduct', 'Swe Career Matrix']\n",
-      "QUESTION:  Which countries are part of it? \n",
-      "ANSWER:    The North America South America region includes the United States, Canada, Mexico, as well as Central and South America. \n",
-      "SUPPORTING DOCUMENTS:  ['Sales Organization Overview', 'Sales Organization Overview', 'Sales Organization Overview', 'Fy2024 Company Sales Strategy']\n",
-      "QUESTION:  Who are the team's leads? \n",
-      "ANSWER:    Laura Martinez is the Area Vice-President of North America, and Gary Johnson is the Area Vice-President of South America. \n",
-      "SUPPORTING DOCUMENTS:  ['Sales Organization Overview', 'Sales Organization Overview', 'Swe Career Matrix', 'Swe Career Matrix']\n"
+      "[CHAT SESSION ID] 09116274-f852-4ae6-9617-c5aa2a17bbff\n",
+      "[QUESTION] What does NASA stand for?\n",
+      "[ANSWER]   NASA stands for North America South America region.\n",
+      "          [SUPPORTING DOCUMENTS] ['Sales Organization Overview', 'Code Of Conduct', 'Code Of Conduct', 'Swe Career Matrix']\n",
+      "[QUESTION] Which countries are part of it?\n",
+      "[ANSWER]   The North America South America region includes the United States, Canada, Mexico, as well as Central and South America.\n",
+      "          [SUPPORTING DOCUMENTS] ['Sales Organization Overview', 'Sales Organization Overview', 'Sales Organization Overview', 'Wfh Policy Update May 2023']\n",
+      "[QUESTION] Who are the team's leads?\n",
+      "[ANSWER]   Laura Martinez is the Area Vice-President of North America, and Gary Johnson is the Area Vice-President of South America.\n",
+      "          [SUPPORTING DOCUMENTS] ['Sales Organization Overview', 'Swe Career Matrix', 'Sales Organization Overview', 'Swe Career Matrix']\n"
      ]
     }
    ],
    "source": [
     "# Define a convenience function for Q&A\n",
-    "def ask(question, history):\n",
-    "    result = chat({\"question\": question, \"chat_history\": chat_history})\n",
-    "    print(\"QUESTION: \", question,\n",
-    "          \"\\nANSWER:  \", result[\"answer\"],\n",
-    "          \"\\nSUPPORTING DOCUMENTS: \", list(map(lambda d: d.metadata[\"name\"], list(result[\"source_documents\"])))\n",
-    "    )\n",
-    "    history.append((question, result[\"answer\"]))\n",
-    "    \n",
-    "chat_history = []\n",
-    "\n",
+    "def ask(question, chat_history):\n",
+    "    result = chat({\"question\": question, \"chat_history\": chat_history.messages})\n",
+    "    print(f\"\"\"[QUESTION] {question}\n",
+    "[ANSWER]  {result[\"answer\"]}\n",
+    "          [SUPPORTING DOCUMENTS] {list(map(lambda d: d.metadata[\"name\"], list(result[\"source_documents\"])))}\"\"\")\n",
+    "    chat_history.add_user_message(result[\"question\"])\n",
+    "    chat_history.add_ai_message(result[\"answer\"])\n",
+    "\n",
+    "# Chat away!\n",
+    "print(f\"[CHAT SESSION ID] {session_id}\")\n",
     "ask(\"What does NASA stand for?\", chat_history)\n",
     "ask(\"Which countries are part of it?\", chat_history)\n",
-    "ask(\"Who are the team's leads?\", chat_history)\n"
+    "ask(\"Who are the team's leads?\", chat_history)"
    ]
   },
   {
@@ -385,7 +395,23 @@
    "source": [
     "# (Optional) Clean up 🧹\n",
     "\n",
-    "Once we're done, we can delete the Elasticsearch index."
+    "Once we're done, we can clean up the chat history for this session..."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "chat_history.clear()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "... or delete the indices."
    ]
   },
   {
@@ -394,7 +420,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "elasticsearch_client.indices.delete(index='workplace-docs')"
+    "elasticsearch_client.indices.delete(index='workplace-docs')\n",
+    "elasticsearch_client.indices.delete(index='workplace-docs-chat-history')"
    ]
   }
  ],
diff --git a/notebooks/generative-ai/lib/elasticsearch_chat_message_history.py b/notebooks/generative-ai/lib/elasticsearch_chat_message_history.py
@@ -0,0 +1,122 @@
+import json
+import logging
+import time
+from typing import List
+
+from elasticsearch import ApiError, Elasticsearch
+
+from langchain.schema import BaseChatMessageHistory
+from langchain.schema.messages import BaseMessage, _message_to_dict, messages_from_dict
+
+logger = logging.getLogger(__name__)
+
+
+class ElasticsearchChatMessageHistory(BaseChatMessageHistory):
+    """Chat message history that stores history in Elasticsearch.
+
+    Every message is indexed as its own document carrying the session id,
+    a creation timestamp and the message serialized as JSON. The index is
+    created on construction when it does not exist yet.
+
+    Args:
+        client: Elasticsearch client.
+        index: name of the index to use.
+        session_id: arbitrary key that is used to store the messages
+            of a single chat session.
+    """
+
+    # A search without an explicit `size` returns only 10 hits, which would
+    # silently truncate longer sessions; 10000 is the default
+    # `index.max_result_window`, the largest page a plain search may request.
+    _MAX_MESSAGES = 10000
+
+    def __init__(
+        self,
+        client: Elasticsearch,
+        index: str,
+        session_id: str,
+    ):
+        self.client: Elasticsearch = client
+        self.index: str = index
+        self.session_id: str = session_id
+
+        if not client.indices.exists(index=index):
+            client.indices.create(
+                index=index,
+                mappings={
+                    "properties": {
+                        "session_id": {"type": "keyword"},
+                        "created_at": {"type": "date"},
+                        "history": {"type": "text"},
+                    }
+                },
+            )
+
+    @property
+    def messages(self) -> List[BaseMessage]:
+        """Retrieve the messages of this session from Elasticsearch.
+
+        Returns:
+            The session's messages sorted by creation time, oldest first.
+            An empty list is returned when the session has no messages or
+            when the search fails (the error is logged).
+        """
+        try:
+            result = self.client.search(
+                index=self.index,
+                query={"term": {"session_id": self.session_id}},
+                # `unmapped_type` keeps the sort valid on an index that was
+                # created before `created_at` was added to the mapping.
+                sort=[{"created_at": {"order": "asc", "unmapped_type": "date"}}],
+                size=self._MAX_MESSAGES,
+            )
+        except ApiError as err:
+            logger.error(err)
+            # `result` is unbound when the search raised, so return here
+            # instead of falling through to the hit processing below.
+            return []
+
+        items = [
+            json.loads(hit["_source"]["history"])
+            for hit in result["hits"]["hits"]
+        ]
+        return messages_from_dict(items)
+
+    def add_message(self, message: BaseMessage) -> None:
+        """Add a message to the chat session in Elasticsearch.
+
+        Args:
+            message: the message to store.
+
+        An `ApiError` raised by the client is logged, not propagated.
+        """
+        try:
+            self.client.index(
+                index=self.index,
+                document={
+                    "session_id": self.session_id,
+                    # Epoch milliseconds; `messages` sorts on this field.
+                    "created_at": round(time.time() * 1000),
+                    "history": json.dumps(_message_to_dict(message)),
+                },
+                # Make the message searchable right away so that an
+                # immediate read of `messages` includes it.
+                refresh=True,
+            )
+        except ApiError as err:
+            logger.error(err)
+
+    def clear(self) -> None:
+        """Clear session memory in Elasticsearch.
+
+        An `ApiError` raised by the client is logged, not propagated.
+        """
+        try:
+            self.client.delete_by_query(
+                index=self.index,
+                query={"term": {"session_id": self.session_id}},
+                # Refresh so the deletion is visible to the next search.
+                refresh=True,
+            )
+        except ApiError as err:
+            logger.error(err)