Skip to content

Commit

Permalink
mbox: allow custom, stable document id (run-llama#393)
Browse files Browse the repository at this point in the history
* mbox: allow custom, stable document id

 * via a function passed in as `id_fn`, e.g. `MboxReader(id_fn=lambda msg: md5(msg.encode()).hexdigest())`
 * overrides UUID-based default from Document

* mbox: use doc_id instead of id_

* mbox: make id_fn example more practical
  • Loading branch information
rc9000 authored Sep 25, 2023
1 parent 138565e commit 56d29d9
Show file tree
Hide file tree
Showing 2 changed files with 11 additions and 2 deletions.
3 changes: 3 additions & 0 deletions llama_hub/file/mbox/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,9 @@ from llama_index import download_loader
MboxReader = download_loader("MboxReader")
documents = MboxReader().load_data(file='./email.mbox') # Returns list of documents

# To customize the document id, pass an id_fn. Its msg argument is the whole message text, rendered according to `message_format`. The example below assumes `from hashlib import md5`.
docs = MboxReader(id_fn=lambda msg: md5(msg[:200].encode()).hexdigest()).load_data(file=d)

```

This loader is designed to be used as a way to load data into [LlamaIndex](https://github.com/jerryjliu/gpt_index/tree/main/gpt_index) and/or subsequently used as a Tool in a [LangChain](https://github.com/hwchase17/langchain) Agent. See [here](https://github.com/emptycrown/llama-hub/tree/main) for examples.
10 changes: 8 additions & 2 deletions llama_hub/file/mbox/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
"""
from pathlib import Path
from typing import Any, Dict, List, Optional
from typing import Any, Callable, Dict, List, Optional

from llama_index.readers.base import BaseReader
from llama_index.readers.schema.base import Document
Expand Down Expand Up @@ -32,12 +32,14 @@ def __init__(
*args: Any,
max_count: int = 0,
message_format: str = DEFAULT_MESSAGE_FORMAT,
id_fn: Optional[Callable[[str], str]] = None,
**kwargs: Any
) -> None:
"""Init params."""
super().__init__(*args, **kwargs)
self.max_count = max_count
self.message_format = message_format
self.id_fn = id_fn

def parse_file(self, filepath: Path, errors: str = "ignore") -> List[str]:
"""Parse file into string."""
Expand Down Expand Up @@ -109,5 +111,9 @@ def load_data(
docs: List[Document] = []
content = self.parse_file(file)
for msg in content:
docs.append(Document(text=msg, extra_info=extra_info or {}))
d = Document(text=msg, extra_info=extra_info or {})
if self.id_fn:
d.doc_id = self.id_fn(msg)
docs.append(d)

return docs

0 comments on commit 56d29d9

Please sign in to comment.