Skip to content

Commit

Permalink
feat: improve pinecone tests (langchain-ai#2806)
Browse files Browse the repository at this point in the history
Improve the integration tests for Pinecone by adding an `.env.example`
file for local testing. Additionally, add some dev dependencies
specifically for integration tests.

This change also helps me understand how Pinecone deals with certain
things, see related issues
langchain-ai#2484
langchain-ai#2816
  • Loading branch information
sergerdn authored Apr 14, 2023
1 parent 016738e commit 04c458a
Show file tree
Hide file tree
Showing 11 changed files with 1,987 additions and 241 deletions.
264 changes: 144 additions & 120 deletions poetry.lock

Large diffs are not rendered by default.

3 changes: 3 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,9 @@ deeplake = "^3.2.21"
torch = "^1.0.0"
chromadb = "^0.3.21"
tiktoken = "^0.3.3"
python-dotenv = "^1.0.0"
gptcache = "^0.1.9"
promptlayer = "^0.1.80"

[tool.poetry.group.lint.dependencies]
ruff = "^0.0.249"
Expand Down
7 changes: 7 additions & 0 deletions tests/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,11 @@ cd tests/integration_tests/vectorstores/docker-compose
docker-compose -f elasticsearch.yml up
```

### Prepare environment variables for local testing:

- copy `tests/integration_tests/.env.example` to `tests/integration_tests/.env`
- set the variables in the `tests/integration_tests/.env` file, e.g. `OPENAI_API_KEY`

Additionally, it's important to note that some integration tests may require certain
environment variables to be set, such as `OPENAI_API_KEY`. Be sure to set any required
environment variables before running the tests to ensure they run correctly.
Expand All @@ -54,7 +59,9 @@ cassettes. You can use the --vcr-record=none command-line option to disable reco
new cassettes. Here's an example:

```bash
pytest --log-cli-level=10 tests/integration_tests/vectorstores/test_pinecone.py --vcr-record=none
pytest tests/integration_tests/vectorstores/test_elasticsearch.py --vcr-record=none

```

### Run some tests with coverage:
Expand Down
9 changes: 9 additions & 0 deletions tests/integration_tests/.env.example
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# openai
# your api key from https://platform.openai.com/account/api-keys
OPENAI_API_KEY=

# pinecone
# your api key from left menu "API Keys" in https://app.pinecone.io
PINECONE_API_KEY=your_pinecone_api_key_here
# your pinecone environment from left menu "API Keys" in https://app.pinecone.io
PINECONE_ENVIRONMENT=us-west4-gcp
36 changes: 21 additions & 15 deletions tests/integration_tests/conftest.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,31 @@
import os
from pathlib import Path

import pytest

# Absolute path of the directory that contains this file
ABS_PATH = os.path.dirname(os.path.abspath(__file__))

# Absolute path of the project's root directory, resolved as two levels
# above ABS_PATH
PROJECT_DIR = os.path.abspath(os.path.join(ABS_PATH, os.pardir, os.pardir))


# Load environment variables from the integration tests' .env file, if present.
def _load_env() -> None:
    env_file = os.path.join(PROJECT_DIR, "tests", "integration_tests", ".env")
    if not os.path.exists(env_file):
        return

    # Imported only on this path, so `dotenv` is not needed when no .env exists.
    from dotenv import load_dotenv

    load_dotenv(env_file)


_load_env()


@pytest.fixture(scope="module")
def test_dir() -> Path:
    """Return the integration tests directory under the project root."""
    return Path(PROJECT_DIR, "tests", "integration_tests")


# This fixture returns a string containing the path to the cassette directory for the
# current module
Expand All @@ -15,18 +36,3 @@ def vcr_cassette_dir(request: pytest.FixtureRequest) -> str:
"cassettes",
os.path.basename(request.module.__file__).replace(".py", ""),
)


# Module-scoped fixture providing the "filter_headers" option as a dictionary.
# Each listed header is replaced with a dummy value; in particular the
# authorization header is replaced so that sensitive data is not recorded
# in the cassette.
@pytest.fixture(scope="module")
def vcr_config() -> dict:
    dummy_headers = [
        ("authorization", "authorization-DUMMY"),
        ("X-OpenAI-Client-User-Agent", "X-OpenAI-Client-User-Agent-DUMMY"),
        ("User-Agent", "User-Agent-DUMMY"),
    ]
    return {"filter_headers": dummy_headers}

Large diffs are not rendered by default.

Large diffs are not rendered by default.

Large diffs are not rendered by default.

53 changes: 51 additions & 2 deletions tests/integration_tests/vectorstores/conftest.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,49 @@
import os
from typing import Generator, List
from typing import Generator, List, Union

import pytest
from vcr.request import Request

from langchain.document_loaders import TextLoader
from langchain.embeddings import OpenAIEmbeddings
from langchain.schema import Document
from langchain.text_splitter import CharacterTextSplitter


# Define a fixture that yields a generator object returning a list of documents
# Module-scoped fixture returning the VCR configuration dictionary.
# - "filter_headers" replaces the listed headers (authorization, API key and
#   user-agent headers) with dummy values to prevent sensitive data from
#   being recorded in the cassette.
# - "before_record_request" returns None for any request whose host starts
#   or ends with an entry of the `skipped_host` list, to prevent that data
#   from being recorded in the cassette; other requests pass through.
# - "before_record_response" returns every response unchanged.
@pytest.fixture(scope="module")
def vcr_config() -> dict:
    skipped_host = ["pinecone.io"]

    def before_record_response(response: dict) -> Union[dict, None]:
        return response

    def before_record_request(request: Request) -> Union[Request, None]:
        should_skip = any(
            request.host.startswith(name) or request.host.endswith(name)
            for name in skipped_host
        )
        return None if should_skip else request

    dummy_headers = [
        ("authorization", "authorization-DUMMY"),
        ("X-OpenAI-Client-User-Agent", "X-OpenAI-Client-User-Agent-DUMMY"),
        ("Api-Key", "Api-Key-DUMMY"),
        ("User-Agent", "User-Agent-DUMMY"),
    ]

    return {
        "before_record_request": before_record_request,
        "before_record_response": before_record_response,
        "filter_headers": dummy_headers,
        "ignore_localhost": True,
    }


# Define a fixture that yields a generator object returning a list of documents
@pytest.fixture(scope="function")
def documents() -> Generator[List[Document], None, None]:
"""Return a generator that yields a list of documents."""

Expand All @@ -23,3 +57,18 @@ def documents() -> Generator[List[Document], None, None]:

# Yield the documents split into chunks
yield text_splitter.split_documents(documents)


@pytest.fixture(scope="function")
def texts() -> Generator[List[str], None, None]:
    """Yield the page contents of the documents loaded from ``sharks.txt``."""
    # The text file lives in the "fixtures" directory next to this module
    fixture_path = os.path.join(os.path.dirname(__file__), "fixtures", "sharks.txt")
    loaded = TextLoader(fixture_path).load()

    contents = []
    for item in loaded:
        contents.append(item.page_content)
    yield contents


@pytest.fixture(scope="module")
def embedding_openai() -> OpenAIEmbeddings:
    """Return an ``OpenAIEmbeddings`` instance constructed without arguments."""
    embeddings = OpenAIEmbeddings()
    return embeddings
38 changes: 20 additions & 18 deletions tests/integration_tests/vectorstores/test_elasticsearch.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,11 @@


class TestElasticsearch:
@classmethod
def setup_class(cls) -> None:
if not os.getenv("OPENAI_API_KEY"):
raise ValueError("OPENAI_API_KEY environment variable is not set")

@pytest.fixture(scope="class", autouse=True)
def elasticsearch_url(self) -> Union[str, Generator[str, None, None]]:
"""Return the elasticsearch url."""
Expand All @@ -34,15 +39,6 @@ def elasticsearch_url(self) -> Union[str, Generator[str, None, None]]:
# print(index_name)
es.indices.delete(index=index_name)

@pytest.fixture(scope="class", autouse=True)
def openai_api_key(self) -> Union[str, Generator[str, None, None]]:
    """Yield the OpenAI API key read from the environment.

    Raises:
        ValueError: if ``OPENAI_API_KEY`` is unset or empty.
    """
    api_key = os.getenv("OPENAI_API_KEY")
    if api_key:
        yield api_key
    else:
        raise ValueError("OPENAI_API_KEY environment variable is not set")

def test_similarity_search_without_metadata(self, elasticsearch_url: str) -> None:
"""Test end to end construction and search without metadata."""
texts = ["foo", "bar", "baz"]
Expand All @@ -67,15 +63,17 @@ def test_similarity_search_with_metadata(self, elasticsearch_url: str) -> None:

@pytest.mark.vcr(ignore_localhost=True)
def test_default_index_from_documents(
self, documents: List[Document], openai_api_key: str, elasticsearch_url: str
self,
documents: List[Document],
embedding_openai: OpenAIEmbeddings,
elasticsearch_url: str,
) -> None:
"""This test checks the construction of a default
ElasticSearch index using the 'from_documents'."""
embedding = OpenAIEmbeddings(openai_api_key=openai_api_key)

elastic_vector_search = ElasticVectorSearch.from_documents(
documents=documents,
embedding=embedding,
embedding=embedding_openai,
elasticsearch_url=elasticsearch_url,
)

Expand All @@ -86,16 +84,18 @@ def test_default_index_from_documents(

@pytest.mark.vcr(ignore_localhost=True)
def test_custom_index_from_documents(
self, documents: List[Document], openai_api_key: str, elasticsearch_url: str
self,
documents: List[Document],
embedding_openai: OpenAIEmbeddings,
elasticsearch_url: str,
) -> None:
"""This test checks the construction of a custom
ElasticSearch index using the 'from_documents'."""

index_name = f"custom_index_{uuid.uuid4().hex}"
embedding = OpenAIEmbeddings(openai_api_key=openai_api_key)
elastic_vector_search = ElasticVectorSearch.from_documents(
documents=documents,
embedding=embedding,
embedding=embedding_openai,
elasticsearch_url=elasticsearch_url,
index_name=index_name,
)
Expand All @@ -110,15 +110,17 @@ def test_custom_index_from_documents(

@pytest.mark.vcr(ignore_localhost=True)
def test_custom_index_add_documents(
self, documents: List[Document], openai_api_key: str, elasticsearch_url: str
self,
documents: List[Document],
embedding_openai: OpenAIEmbeddings,
elasticsearch_url: str,
) -> None:
"""This test checks the construction of a custom
ElasticSearch index using the 'add_documents'."""

index_name = f"custom_index_{uuid.uuid4().hex}"
embedding = OpenAIEmbeddings(openai_api_key=openai_api_key)
elastic_vector_search = ElasticVectorSearch(
embedding=embedding,
embedding=embedding_openai,
elasticsearch_url=elasticsearch_url,
index_name=index_name,
)
Expand Down
Loading

0 comments on commit 04c458a

Please sign in to comment.