Add Zulip Reader (#181)

Co-authored-by: Barton Rhodes <barton@plurigrid.xyz>
run-llama · Apr 13, 2023 · 9a98b46 · 9a98b46
1 parent 2ab8ee6
commit 9a98b46
Show file tree

Hide file tree

Showing 5 changed files with 105 additions and 99 deletions.
diff --git a/loader_hub/library.json b/loader_hub/library.json
@@ -3,10 +3,6 @@
     "id": "asana",
     "author": "daveey"
   },
-  "AzCognitiveSearchReader": {
-    "id": "azcognitive_search",
-    "author": "mrcabellom"
-  },
   "GoogleDocsReader": {
     "id": "google_docs",
     "author": "jerryjliu"
@@ -60,10 +56,6 @@
     "id": "file/json",
     "author": "yisding"
   },
-  "JSONDataReader": {
-    "id": "jsondata",
-    "author": "Josh-XT"
-  },
   "MarkdownReader": {
     "id": "file/markdown",
     "author": "hursh-desai"
@@ -72,10 +64,6 @@
     "id": "file/audio",
     "author": "ravi03071991"
   },
-  "GladiaAudioTranscriber": {
-    "id": "file/audio_gladia",
-    "author": "ravi03071991"
-  },
   "SimpleCSVReader": {
     "id": "file/simple_csv",
     "author": "vguillet"
@@ -388,91 +376,8 @@
     "id": "airtable",
     "author": "smyja"
   },
-  "HatenaBlogReader": {
-    "id": "hatena_blog",
-    "author": "Shoya SHIRAKI",
-    "keywords": [
-      "hatena",
-      "blog"
-    ]
-  },
-  "OpendalReader": {
-    "id": "opendal_reader",
-    "author": "OpenDAL Contributors",
-    "keywords": [
-      "storage"
-    ]
-  },
-  "OpendalS3Reader": {
-    "id": "opendal_reader/s3",
-    "author": "OpenDAL Contributors",
-    "keywords": [
-      "storage",
-      "s3"
-    ]
-  },
-  "OpendalAzblobReader": {
-    "id": "opendal_reader/azblob",
-    "author": "OpenDAL Contributors",
-    "keywords": [
-      "storage",
-      "azblob"
-    ]
-  },
-  "OpendalGcsReader": {
-    "id": "opendal_reader/gcs",
-    "author": "OpenDAL Contributors",
-    "keywords": [
-      "storage",
-      "gcs"
-    ]
-  },
-  "ConfluenceReader": {
-    "id": "confluence",
-    "author": "zywilliamli"
-  },
-  "ChatGPTRetrievalPluginReader": {
-    "id": "chatgpt_plugin",
-    "author": "jerryjliu"
-  },
-  "JiraReader": {
-    "id": "jira",
-    "author": "bearguy",
-    "keywords": [
-      "jira"
-    ]
-  },
-  "UnstructuredURLLoader": {
-    "id": "web/unstructured_web",
-    "author": "kravetsmic",
-    "keywords": [
-      "unstructured.io",
-      "url"
-    ]
-  },
-  "GoogleSheetsReader": {
-    "id": "google_sheets",
-    "author": "piroz"
-  },
-  "FeedlyRssReader": {
-    "id": "feedly_rss",
-    "author": "kychanbp",
-    "keywords": [
-      "feedly",
-      "rss"
-    ]
-  },
-  "FlatPdfReader": {
-    "id": "file/flat_pdf",
-    "author": "emmanuel-oliveira",
-    "keywords": [
-      "pdf",
-      "flat",
-      "flattened"
-    ]
-  },
-  "MilvusReader": {
-    "id": "milvus",
-    "author": "filip-halt"
+  "ZulipReader": {
+    "id": "zulip",
+    "author": "plurigrid"
   }
-}
+}
diff --git a/loader_hub/zulip/README.md b/loader_hub/zulip/README.md
@@ -0,0 +1,32 @@
+## Zulip Loader
+
+The Zulip Loader is a Python reader that loads data from Zulip streams using a Zulip bot's API token. It fetches the most recent messages (up to 100 per stream) from the streams you name and returns one document per stream. To load every stream, pass the names returned by `get_all_streams()`.
+
+### Prerequisites
+
+Create a Zulip bot and obtain its API token. Follow the instructions in the Zulip documentation to create a bot and get the API key (token).
+
+Set the ZULIP_TOKEN environment variable to your Zulip bot's API token:
+```bash
+export ZULIP_TOKEN="your-zulip-bot-api-token"
+```
+
+Use the ZulipReader class to load data from Zulip streams:
+
+```python
+
+from llama_index import download_loader
+
+ZulipReader = download_loader("ZulipReader")
+
+# Initialize the ZulipReader with the bot's email and Zulip domain
+reader = ZulipReader(zulip_email="your-bot-email@your-zulip-domain.zulipchat.com", zulip_domain="your-zulip-domain.zulipchat.com")
+
+# Load data from all streams
+data = reader.load_data(reader.get_all_streams())
+
+# Load data from specific streams
+stream_names = ["stream1", "stream2"]
+data = reader.load_data(stream_names)
+# This will return a list of documents containing the content of the specified streams.
+```
+
+For more customization, pass the `reverse_chronological` parameter (default `True`) to `load_data()` to control the order in which each stream's messages appear in its document.
diff --git a/loader_hub/zulip/__init__.py b/loader_hub/zulip/__init__.py
@@ -0,0 +1 @@
+"""Init file."""
diff --git a/loader_hub/zulip/base.py b/loader_hub/zulip/base.py
@@ -0,0 +1,67 @@
+import logging
+from typing import List, Optional
+from datetime import datetime
+import os
+from llama_index.readers.base import BaseReader
+from llama_index.readers.schema.base import Document
+
+logger = logging.getLogger(__name__)
+
+class ZulipReader(BaseReader):
+    """Reader that loads the most recent messages of Zulip streams.
+
+    The bot's API key is read from the ``ZULIP_TOKEN`` environment variable.
+
+    Args:
+        zulip_email (str): Email address of the Zulip bot.
+        zulip_domain (str): Site of the Zulip organization, e.g.
+            ``your-org.zulipchat.com``.
+        earliest_date (Optional[datetime]): If given, messages sent before
+            this moment are skipped.
+        latest_date (Optional[datetime]): If given, messages sent after
+            this moment are skipped.
+    """
+
+    def __init__(
+        self,
+        zulip_email: str,
+        zulip_domain: str,
+        earliest_date: Optional[datetime] = None,
+        latest_date: Optional[datetime] = None,
+    ) -> None:
+        """Initialize with parameters.
+
+        Raises:
+            ValueError: If the ``ZULIP_TOKEN`` environment variable is not set.
+        """
+        # Imported here rather than at module level so that importing this
+        # module does not itself require the ``zulip`` package.
+        import zulip
+
+        # Read the Zulip token from the environment variable
+        zulip_token = os.environ.get("ZULIP_TOKEN")
+        if zulip_token is None:
+            raise ValueError("ZULIP_TOKEN environment variable not set.")
+
+        # Kept so that _read_stream can restrict messages to this window.
+        self.earliest_date = earliest_date
+        self.latest_date = latest_date
+
+        # Initialize Zulip client with provided parameters
+        self.client = zulip.Client(
+            api_key=zulip_token, email=zulip_email, site=zulip_domain
+        )
+
+    def _in_date_range(self, message: dict) -> bool:
+        """Return True if the message falls inside the configured date window.
+
+        Relies on the message's ``timestamp`` field (UNIX seconds, per the
+        Zulip API); messages without it are kept. Naive datetimes are
+        interpreted as local time by ``datetime.timestamp()``.
+        """
+        sent_at = message.get("timestamp")
+        if sent_at is None:
+            return True
+        if self.earliest_date is not None and sent_at < self.earliest_date.timestamp():
+            return False
+        if self.latest_date is not None and sent_at > self.latest_date.timestamp():
+            return False
+        return True
+
+    def _read_stream(self, stream_name: str, reverse_chronological: bool) -> str:
+        """Read a stream.
+
+        Fetches up to the 100 newest messages of ``stream_name`` and joins
+        their contents with single spaces.
+
+        Args:
+            stream_name (str): Name of the stream to read.
+            reverse_chronological (bool): If True, reverse the order in which
+                the API returned the messages before joining them.
+
+        Raises:
+            ValueError: If the API response contains no messages list
+                (for example because the request failed).
+        """
+        request = {
+            "narrow": [{"operator": "stream", "operand": stream_name}],
+            "anchor": "newest",
+            "num_before": 100,
+            "num_after": 0,
+        }
+        response = self.client.get_messages(request)
+        if "messages" not in response:
+            # Surface the server's explanation instead of an opaque KeyError.
+            raise ValueError(
+                f"Could not read Zulip stream {stream_name!r}: {response.get('msg')}"
+            )
+
+        messages = [m for m in response["messages"] if self._in_date_range(m)]
+        if reverse_chronological:
+            messages.reverse()
+        return " ".join(message["content"] for message in messages)
+
+    def load_data(
+        self, streams: List[str], reverse_chronological: bool = True
+    ) -> List[Document]:
+        """Load data from the input streams.
+
+        Args:
+            streams (List[str]): Names of the streams to read.
+            reverse_chronological (bool): Passed through to the per-stream
+                read; see ``_read_stream``. Defaults to True.
+
+        Returns:
+            List[Document]: One document per stream, with the stream name
+            stored under ``extra_info["stream"]``.
+        """
+        return [
+            Document(
+                self._read_stream(stream_name, reverse_chronological),
+                extra_info={"stream": stream_name},
+            )
+            for stream_name in streams
+        ]
+
+    def get_all_streams(self) -> List[str]:
+        """Return the names of all streams reported by the Zulip client.
+
+        Raises:
+            ValueError: If the API response contains no streams list
+                (for example because the request failed).
+        """
+        response = self.client.get_streams()
+        if "streams" not in response:
+            raise ValueError(f"Could not list Zulip streams: {response.get('msg')}")
+        # Collect the stream names
+        return [stream["name"] for stream in response["streams"]]
+
+if __name__ == "__main__":
+    # Manual smoke test: load every stream and log the resulting documents.
+    # The root logger defaults to WARNING, so INFO output must be enabled
+    # explicitly or the result below is never shown.
+    logging.basicConfig(level=logging.INFO)
+    reader = ZulipReader(zulip_email="ianita-bot@plurigrid.zulipchat.com", zulip_domain="plurigrid.zulipchat.com")
+    logging.info(reader.load_data(reader.get_all_streams()))
diff --git a/loader_hub/zulip/requirements.txt b/loader_hub/zulip/requirements.txt
@@ -0,0 +1 @@
+zulip