-
Notifications
You must be signed in to change notification settings - Fork 5.7k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
feat(mongo reader): field_extractors #18063
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,6 +1,7 @@ | ||
"""Mongo client.""" | ||
|
||
from typing import Dict, Iterable, List, Optional, Union | ||
from collections.abc import Callable | ||
from typing import Dict, Iterable, List, Optional | ||
|
||
from llama_index.core.readers.base import BaseReader | ||
from llama_index.core.schema import Document | ||
|
@@ -40,12 +41,6 @@ def __init__( | |
|
||
self.client = client | ||
|
||
def _flatten(self, texts: List[Union[str, List[str]]]) -> List[str]: | ||
result = [] | ||
for text in texts: | ||
result += text if isinstance(text, list) else [text] | ||
return result | ||
|
||
def lazy_load_data( | ||
self, | ||
db_name: str, | ||
|
@@ -55,6 +50,7 @@ def lazy_load_data( | |
query_dict: Optional[Dict] = None, | ||
max_docs: int = 0, | ||
metadata_names: Optional[List[str]] = None, | ||
field_extractors: Optional[Dict[str, Callable[..., str]]] = None, | ||
) -> Iterable[Document]: | ||
"""Load data from the input directory. | ||
|
||
|
@@ -72,6 +68,9 @@ def lazy_load_data( | |
Defaults to 0 (no limit) | ||
metadata_names (Optional[List[str]]): names of the fields to be added | ||
to the metadata attribute of the Document. Defaults to None | ||
field_extractors (Optional[Dict[str, Callable[..., str]]]): dictionary | ||
containing field name and a function to extract text from the field. | ||
The default extractor function is `str`. Defaults to None. | ||
|
||
Returns: | ||
List[Document]: A list of documents. | ||
|
@@ -84,15 +83,18 @@ def lazy_load_data( | |
projection={name: 1 for name in field_names + (metadata_names or [])}, | ||
) | ||
|
||
field_extractors = field_extractors or {} | ||
|
||
for item in cursor: | ||
try: | ||
texts = [str(item[name]) for name in field_names] | ||
texts = [ | ||
field_extractors.get(name, str)(item[name]) for name in field_names | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more.
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It is |
||
] | ||
except KeyError as err: | ||
raise ValueError( | ||
f"{err.args[0]} field not found in Mongo document." | ||
) from err | ||
|
||
texts = self._flatten(texts) | ||
text = separator.join(texts) | ||
|
||
if metadata_names is None: | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Why do we remove flatten?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
`texts` is already of type `list[str]`, so calling `_flatten(texts)` simply returns `texts` itself without any modifications. Therefore, removing `_flatten` simplifies the code without changing its behavior.