Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add start and end indices to Node extra info during index construction #265

Merged
merged 2 commits into from
Jan 21, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion gpt_index/data_structs/data_structs.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import random
import sys
from dataclasses import dataclass, field
from typing import Dict, List, Optional, Set
from typing import Any, Dict, List, Optional, Set

from dataclasses_json import DataClassJsonMixin

Expand Down Expand Up @@ -51,6 +51,9 @@ def __post_init__(self) -> None:
# reference document id
ref_doc_id: Optional[str] = None

# extra node info
node_info: Optional[Dict[str, Any]] = None

def get_text(self) -> str:
"""Get text."""
text = super().get_text()
Expand Down
7 changes: 7 additions & 0 deletions gpt_index/indices/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -196,16 +196,23 @@ def _get_nodes_from_document(
"""Add document to index."""
text_chunks = text_splitter.split_text(document.get_text())
nodes = []
index_counter = 0
for i, text_chunk in enumerate(text_chunks):
fmt_text_chunk = truncate_text(text_chunk, 50)
print(f"> Adding chunk: {fmt_text_chunk}")
index_pos_info = {
"start": index_counter, # NOTE: start is inclusive
"end": index_counter + len(text_chunk), # NOTE: end is exclusive
}
index_counter += len(text_chunk) + 1
# if embedding specified in document, pass it to the Node
node = Node(
text=text_chunk,
index=start_idx + i,
ref_doc_id=document.get_doc_id(),
embedding=document.embedding,
extra_info=document.extra_info if self._include_extra_info else None,
node_info=index_pos_info,
)
nodes.append(node)
return nodes
Expand Down
18 changes: 18 additions & 0 deletions tests/indices/list/test_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -257,3 +257,21 @@ def test_extra_info(
assert list_index.index_struct.nodes[3].get_text() == (
"extra_info: extra_info\n" "foo: bar\n\n" "This is a test v2."
)


@patch_common
def test_node_indices(
    _mock_init: Any,
    _mock_predict: Any,
    _mock_total_tokens_used: Any,
    _mock_split_text: Any,
    documents: List[Document],
    struct_kwargs: Dict,
) -> None:
    """Verify the start/end offsets stored in each node's ``node_info``."""
    # struct_kwargs is a pair; only the index-construction half is needed here.
    build_kwargs, _query_kwargs = struct_kwargs
    list_index = GPTListIndex(documents, **build_kwargs)

    # Expected ``node_info`` for the first three nodes, in node order.
    expected_spans = [
        {"start": 0, "end": 12},
        {"start": 13, "end": 28},
        {"start": 29, "end": 50},
    ]
    # Index explicitly (rather than zip) so a missing node still raises.
    for position, span in enumerate(expected_spans):
        assert list_index.index_struct.nodes[position].node_info == span