Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
*.log
__pycache__
*.graphml
*.json
.env

graph
kv_store
vdb
12 changes: 8 additions & 4 deletions PathRAG/llm.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,8 +35,12 @@
logger,
)

from dotenv import load_dotenv

import sys

load_dotenv()

if sys.version_info < (3, 9):
from typing import AsyncIterator
else:
Expand All @@ -55,8 +59,8 @@ async def openai_complete_if_cache(
prompt,
system_prompt=None,
history_messages=[],
base_url="https://api.openai.com/v1",
api_key="",
base_url=os.getenv("BASE_URL"),
api_key=os.getenv("API_KEY"),
**kwargs,
) -> str:
if api_key:
Expand Down Expand Up @@ -764,8 +768,8 @@ async def zhipu_embedding(
async def openai_embedding(
texts: list[str],
model: str = "text-embedding-3-small",
base_url="https://api.openai.com/v1",
api_key="",
base_url=os.getenv("BASE_URL"),
api_key=os.getenv("API_KEY"),
) -> np.ndarray:
if api_key:
os.environ["OPENAI_API_KEY"] = api_key
Expand Down
9 changes: 9 additions & 0 deletions PathRAG/settings.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
"""Central filesystem settings for PathRAG's persisted stores."""
import os
from pathlib import Path
from dotenv import load_dotenv  # NOTE(review): imported but never called here — presumably callers rely on v1_test.py/llm.py calling load_dotenv(); confirm and either call it or drop the import.

# Repository root: two levels up from this file (PathRAG/settings.py -> repo root).
BASE_DIR = Path(__file__).resolve().parent.parent

# Storage locations, all anchored at the repository root.
# NOTE(review): VDB_FILE__PATH keeps its double underscore on purpose —
# storage.py imports it under exactly this name.
GRAPH_FILE_PATH = str(BASE_DIR / 'graph')
KV_STORE_FILE_PATH = str(BASE_DIR / 'kv_store')
VDB_FILE__PATH = str(BASE_DIR / 'vdb')
15 changes: 7 additions & 8 deletions PathRAG/storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,12 +21,15 @@
BaseVectorStorage,
)

from .settings import KV_STORE_FILE_PATH
from .settings import GRAPH_FILE_PATH
from .settings import VDB_FILE__PATH


@dataclass
class JsonKVStorage(BaseKVStorage):
def __post_init__(self):
working_dir = self.global_config["working_dir"]
self._file_name = os.path.join(working_dir, f"kv_store_{self.namespace}.json")
self._file_name = os.path.join(KV_STORE_FILE_PATH, f"kv_store_{self.namespace}.json")
self._data = load_json(self._file_name) or {}
logger.info(f"Load KV {self.namespace} with {len(self._data)} data")

Expand Down Expand Up @@ -68,9 +71,7 @@ class NanoVectorDBStorage(BaseVectorStorage):
cosine_better_than_threshold: float = 0.2

def __post_init__(self):
self._client_file_name = os.path.join(
self.global_config["working_dir"], f"vdb_{self.namespace}.json"
)
self._client_file_name = os.path.join(VDB_FILE__PATH, f"vdb_{self.namespace}.json")
self._max_batch_size = self.global_config["embedding_batch_num"]
self._client = NanoVectorDB(
self.embedding_func.embedding_dim, storage_file=self._client_file_name
Expand Down Expand Up @@ -242,9 +243,7 @@ def _get_edge_key(source: Any, target: Any) -> str:
return fixed_graph

def __post_init__(self):
self._graphml_xml_file = os.path.join(
self.global_config["working_dir"], f"graph_{self.namespace}.graphml"
)
self._graphml_xml_file = os.path.join(GRAPH_FILE_PATH, f"graph_{self.namespace}.graphml")
preloaded_graph = NetworkXStorage.load_nx_graph(self._graphml_xml_file)
if preloaded_graph is not None:
logger.info(
Expand Down
51 changes: 24 additions & 27 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,41 +2,38 @@ The code for the paper **"PathRAG: Pruning Graph-based Retrieval Augmented Gener
## Install
```bash
cd PathRAG
pip install -e .
pip install -e . # or pip install -r requirements.txt
```
## Quick Start
* You can quickly experience this project in the `v1_test.py` file.
* Set OpenAI API key in environment if using OpenAI models: `api_key="sk-...".` in the `v1_test.py` and `llm.py` file
* Prepare your retrieval document "text.txt".
* Use the following Python snippet in the "v1_text.py" file to initialize PathRAG and perform queries.

```python
import os
from PathRAG import PathRAG, QueryParam
from PathRAG.llm import gpt_4o_mini_complete

WORKING_DIR = "./your_working_dir"
api_key="your_api_key"
os.environ["OPENAI_API_KEY"] = api_key
base_url="https://api.openai.com/v1"
os.environ["OPENAI_API_BASE"]=base_url
## RUN the project

### Windows

if not os.path.exists(WORKING_DIR):
os.mkdir(WORKING_DIR)
```bash
python -m venv .venv # create virtual environment
.venv\Scripts\activate # activate the virtual environment
python v1_test.py # to run the project

rag = PathRAG(
working_dir=WORKING_DIR,
llm_model_func=gpt_4o_mini_complete,
)
# if it doesn't work properly, try reinstalling the packages using the installation command above
```

data_file="./text.txt"
question="your_question"
with open(data_file) as f:
rag.insert(f.read())
### Linux/Unix

print(rag.query(question, param=QueryParam(mode="hybrid")))
```bash
python3 -m venv .venv # create virtual environment
source .venv/bin/activate # activate the virtual environment
python3 v1_test.py # to run the project

# if it doesn't work properly, try reinstalling the packages using the installation command above
```

## Quick Start
* You can quickly experience this project in the `v1_test.py` file.
* Rename `example.env` to `.env`
* Set OpenAI API key in `.env` file and the BASE URL.
* Prepare your retrieval document `text.txt`. You can modify this in the code in `v1_test.py`.
* The `v1_test.py` file is the entry point to initialize PathRAG and perform queries.

## Parameter modification
You can adjust the relevant parameters in the `base.py` and `operate.py` files.

Expand Down
2 changes: 2 additions & 0 deletions example.env
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
API_KEY=
BASE_URL=https://api.openai.com/v1
46 changes: 38 additions & 8 deletions v1_test.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,37 @@
import os
from PathRAG import PathRAG, QueryParam
from PathRAG.llm import gpt_4o_mini_complete
from pathlib import Path

WORKING_DIR = ""
from dotenv import load_dotenv

api_key=""
load_dotenv()

WORKING_DIR = "./PathRAG"

# Define storage paths
BASE_DIR = Path(__file__).resolve().parent
GRAPH_FILE_PATH = os.path.join(BASE_DIR, 'graph')
KV_STORE_FILE_PATH = os.path.join(BASE_DIR, 'kv_store')
VDB_FILE_PATH = os.path.join(BASE_DIR, 'vdb')

# Ensure directories exist
for path in [GRAPH_FILE_PATH, KV_STORE_FILE_PATH, VDB_FILE_PATH]:
os.makedirs(path, exist_ok=True)

# Ensure necessary JSON files exist
for file_name in ["kv_store_full_docs.json", "kv_store_text_chunks.json", "kv_store_llm_response_cache.json"]:
file_path = os.path.join(KV_STORE_FILE_PATH, file_name)
if not os.path.exists(file_path):
with open(file_path, "w", encoding="utf-8") as f:
f.write("{}") # Initialize with empty JSON object

# Set up API keys
api_key = os.getenv("API_KEY")
os.environ["OPENAI_API_KEY"] = api_key
base_url="https://api.openai.com/v1"
os.environ["OPENAI_API_BASE"]=base_url
base_url = os.getenv("BASE_URL")
os.environ["OPENAI_API_BASE"] = base_url



if not os.path.exists(WORKING_DIR):
Expand All @@ -18,10 +42,16 @@
llm_model_func=gpt_4o_mini_complete,
)

data_file=""
question=""
with open(data_file) as f:
rag.insert(f.read())
data_file="text.txt"
question="what is this document all about?"

with open(data_file, "r", encoding="utf-8") as f:
file_content = f.read().strip()

if not file_content:
raise ValueError("The input file is empty. Please provide valid content.")

rag.insert(file_content)

print(rag.query(question, param=QueryParam(mode="hybrid")))

Expand Down