
doc: init docs.ragas.io #170


Merged · 12 commits · Oct 9, 2023
23 changes: 23 additions & 0 deletions .readthedocs.yml
@@ -0,0 +1,23 @@
version: 2

# Set the OS, Python version and other tools you might need
build:
  os: ubuntu-22.04
  tools:
    python: "3.11"
    # You can also specify other tool versions:
    # nodejs: "20"
    # rust: "1.70"
    # golang: "1.20"

# Build documentation in the "docs/" directory with Sphinx
sphinx:
  configuration: ./docs/conf.py
  # You can configure Sphinx to use a different builder, for instance use the dirhtml builder for simpler URLs
  # builder: "dirhtml"
  # Fail on all warnings to avoid broken references
  # fail_on_warning: true

python:
  install:
    - requirements: ./requirements/docs.txt
5 changes: 5 additions & 0 deletions Makefile
@@ -34,3 +34,8 @@ test: ## Run tests
test-e2e: ## Run end2end tests
	echo "running end2end tests..."
	@pytest tests/e2e -s

# Docs
doc-site: ## Build and serve documentation
	@sphinx-build -nW --keep-going -j 4 -b html $(GIT_ROOT)/docs/ $(GIT_ROOT)/docs/_build/html
	@python -m http.server --directory $(GIT_ROOT)/docs/_build/html
4 changes: 2 additions & 2 deletions README.md
@@ -41,9 +41,9 @@

> 🚀 Dedicated solutions and support to improve the reliability of RAG systems in production including custom models for production quality monitoring. Contact founders to learn more. [Talk to founders](https://calendly.com/shahules/30min)

ragas is a framework that helps you evaluate your Retrieval Augmented Generation (RAG) pipelines. RAG denotes a class of LLM applications that use external data to augment the LLM’s context. There are existing tools and frameworks that help you build these pipelines but evaluating it and quantifying your pipeline performance can be hard. This is where ragas (RAG Assessment) comes in.
Ragas is a framework that helps you evaluate your Retrieval Augmented Generation (RAG) pipelines. RAG denotes a class of LLM applications that use external data to augment the LLM’s context. There are existing tools and frameworks that help you build these pipelines but evaluating it and quantifying your pipeline performance can be hard. This is where Ragas (RAG Assessment) comes in.

ragas provides you with the tools based on the latest research for evaluating LLM-generated text to give you insights about your RAG pipeline. ragas can be integrated with your CI/CD to provide continuous checks to ensure performance.
Ragas provides you with the tools based on the latest research for evaluating LLM-generated text to give you insights about your RAG pipeline. Ragas can be integrated with your CI/CD to provide continuous checks to ensure performance.

## :shield: Installation

20 changes: 20 additions & 0 deletions docs/Makefile
@@ -0,0 +1,20 @@
# Minimal makefile for Sphinx documentation
#

# You can set these variables from the command line, and also
# from the environment for the first two.
SPHINXOPTS ?=
SPHINXBUILD ?= sphinx-build
SOURCEDIR = source
BUILDDIR = build

# Put it first so that "make" without argument is like "make help".
help:
	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

.PHONY: help Makefile

# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
25 changes: 25 additions & 0 deletions docs/_static/css/ragas.css
@@ -0,0 +1,25 @@
/* Make pandas tables look correct in dark-mode */
div.cell_output table {
  color: var(--color-content-foreground);
}

div.cell_output table {
  margin: auto;
}

div.cell_output tbody tr:nth-child(odd):not(:hover) {
  background: var(--color-table-header-background);
}

div.cell_output thead {
  border-bottom-color: var(--color-code-foreground);
}

div.cell_input {
  display: none;
}

.dark {
  background: var(--color-content-background);
  color: var(--color-content-foreground);
}
Binary file added docs/_static/favicon.ico
File renamed without changes
File renamed without changes
File renamed without changes
Binary file added docs/_static/imgs/ragas-logo.png
6 changes: 3 additions & 3 deletions docs/Metrics.ipynb → docs/concepts/Metrics.ipynb
@@ -41,9 +41,9 @@
],
"metadata": {
"kernelspec": {
"display_name": "ragas",
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "ragas"
"name": "python3"
},
"language_info": {
"codemirror_mode": {
@@ -55,7 +55,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.8"
"version": "3.10.12"
}
},
"nbformat": 4,
5 changes: 5 additions & 0 deletions docs/concepts/index.md
@@ -0,0 +1,5 @@
# Core Concepts

```{toctree}
metrics.md
```
File renamed without changes.
45 changes: 45 additions & 0 deletions docs/conf.py
@@ -0,0 +1,45 @@
# Configuration file for the Sphinx documentation builder.
#
# For the full list of built-in configuration values, see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html

# -- Project information -----------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information

project = "ragas"
copyright = "2023, ExplodingGradients"
author = "ExplodingGradients"
release = "0.0.16"

# -- General configuration ---------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration

extensions = [
    "myst_parser",
    "sphinx_design",
    # "myst_parser",
    "sphinxawesome_theme.highlighting",
    # "sphinxawesome_theme.docsearch",
]
source_suffix = [".rst", ".md"]

templates_path = ["_templates"]
exclude_patterns = []


# -- Options for HTML output -------------------------------------------------
# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output

html_title = "Ragas"
html_theme = "sphinxawesome_theme"
html_static_path = ["_static"]
html_css_files = ["css/ragas.css"]
html_favicon = "./_static/favicon.ico"

html_theme_options = {
    "logo_light": "./_static/imgs/ragas-logo.png",
    "logo_dark": "./_static/imgs/ragas-logo.png",
}

# -- Myst NB Config -------------------------------------------------
nb_execution_mode = "auto"
118 changes: 118 additions & 0 deletions docs/getstarted/evaluation.md
@@ -0,0 +1,118 @@
---
file_format: mystnb
kernelspec:
  name: python3
execution:
  timeout: 300
---
# Evaluation

Welcome to the Ragas quickstart. We're going to get you up and running with Ragas as quickly as we can, so that you can go back to improving your Retrieval Augmented Generation pipelines while this library makes sure your changes are improving your entire pipeline.

To kick things off, let's start with the data.

```{note}
Are you using Azure OpenAI endpoints? Then check out [this quickstart guide](./guides/quickstart-azure-openai.ipynb).
```

First, install Ragas:
```bash
pip install ragas
```

Ragas also uses OpenAI for running some metrics, so make sure you have your OpenAI key ready and available in your environment:
```python
import os

os.environ["OPENAI_API_KEY"] = "your-openai-key"
```
## The Data

Ragas performs a `ground_truth`-free evaluation of your RAG pipelines. This is because, for most people, building a gold-labeled dataset that represents the distribution they see in production is a very expensive process.

```{note}
While Ragas was originally aimed at `ground_truth`-free evaluations, there are some aspects of the RAG pipeline that need `ground_truth` in order to be measured. We're in the process of building testset generation features that will make this easier. Check out [issue #136](https://github.com/explodinggradients/ragas/issues/136) for more details.
```

Hence, to work with Ragas, all you need is the following data (a minimal example follows this list):
- question: `list[str]` - These are the questions your RAG pipeline will be evaluated on.
- answer: `list[str]` - The answer generated from the RAG pipeline and given to the user.
- contexts: `list[list[str]]` - The contexts which were passed into the LLM to answer the question.
- ground_truths: `list[list[str]]` - The ground truth answers to the questions. (Only required if you are using context_recall.)
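
To make that format concrete, here is a minimal hand-rolled dataset; the row values are hypothetical, made up purely for illustration:

```python
from datasets import Dataset

# Hypothetical example row -- replace with outputs from your own pipeline.
data = {
    "question": ["When was the first Super Bowl held?"],
    "answer": ["The first Super Bowl was held on January 15, 1967."],
    "contexts": [
        ["The First AFL-NFL World Championship Game was played on January 15, 1967."]
    ],
    "ground_truths": [["The first Super Bowl was held on January 15, 1967."]],
}
eval_dataset = Dataset.from_dict(data)
```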

Ideally, your list of questions should reflect the questions your users ask, including those that have been problematic in the past.

Here we're using an example dataset from one of the baselines we created for the [Financial Opinion Mining and Question Answering (fiqa) Dataset](https://sites.google.com/view/fiqa/). If you want to know more about the baseline, feel free to check the `experiments/baseline` section.

```{code-cell} python
# data
from datasets import load_dataset

fiqa_eval = load_dataset("explodinggradients/fiqa", "ragas_eval")
fiqa_eval
```

## Metrics

Ragas provides you with a few metrics to evaluate the different aspects of your RAG systems, namely:

1. Metrics to evaluate retrieval: `context_precision` and `context_recall`, which measure the performance of your retrieval system.
2. Metrics to evaluate generation: `faithfulness`, which measures hallucinations, and `answer_relevancy`, which measures how to-the-point the answers are to the question.

The harmonic mean of these four aspects gives you the **ragas score**, a single measure of the performance of your QA system across all the important aspects (a worked example follows).
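
As a quick illustration of how a harmonic mean combines the four aspect scores, here is a tiny worked example (the numbers are made up; the library computes this for you):

```python
# Hypothetical aspect scores: context_precision, context_recall,
# faithfulness, answer_relevancy.
scores = [0.9, 0.8, 0.95, 0.7]

# Harmonic mean: n divided by the sum of reciprocals. It is dominated by
# the weakest score, so one bad aspect drags the overall number down.
ragas_score = len(scores) / sum(1 / s for s in scores)
print(round(ragas_score, 3))  # 0.826
```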

Now let's import these metrics and understand more about what they denote.

```{code-cell}
from ragas.metrics import (
    answer_relevancy,
    faithfulness,
    context_recall,
    context_precision,
)
from ragas.metrics.critique import harmfulness
```
Here you can see the metrics we are using, but what do they represent?

1. context_precision - a measure of how relevant the retrieved context is to the question. Conveys the quality of the retrieval pipeline.
2. answer_relevancy - a measure of how relevant the answer is to the question.
3. faithfulness - the factual consistency of the answer against the context, based on the question.
4. context_recall - measures the ability of the retriever to retrieve all the necessary information needed to answer the question.
5. harmfulness (AspectCritique) - in general, `AspectCritique` is a metric that can be used to quantify various aspects of the answer. Aspects like harmfulness, maliciousness, coherence, correctness, and conciseness are available by default, but you can easily define your own; a sketch follows this list. Check the [docs](./metrics.md) for more info.
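
As a sketch of what defining your own aspect could look like, assuming `AspectCritique` is importable from `ragas.metrics.critique` and accepts a name plus a natural-language definition (check the metrics docs to confirm the exact signature):

```python
from ragas.metrics.critique import AspectCritique

# Hypothetical custom aspect; the definition is the yes/no question the
# critique LLM is asked about each answer.
conciseness = AspectCritique(
    name="conciseness",
    definition="Is the answer short, direct, and free of filler?",
)
```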

```{note}
By default these metrics use OpenAI's API to compute the score. If you're using these metrics, make sure you set the environment variable `OPENAI_API_KEY` with your API key. You can also try other LLMs for evaluation; check the [llm guide](./guides/llms.ipynb) to learn more.
```

If you're interested in learning more, feel free to check the [docs](https://github.com/explodinggradients/ragas/blob/main/docs/metrics.md).

## Evaluation

Running the evaluation is as simple as calling `evaluate` on the `Dataset` with the metrics of your choice.

```{code-cell}
from ragas import evaluate

result = evaluate(
    fiqa_eval["baseline"].select(range(1)),
    metrics=[
        context_precision,
        faithfulness,
        answer_relevancy,
        context_recall,
        harmfulness,
    ],
)

result
```
And there you have it: all the scores you need. `ragas_score` gives you a single metric that you can use, while the other ones measure different parts of your pipeline.

Now, if you want to dig into the results and find examples where your pipeline performed poorly or really well, you can easily convert the result into a pandas DataFrame and use your standard analytics tools too!

```{code-cell}
df = result.to_pandas()
df.head()
```
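
For instance, a quick way to surface the weakest examples, assuming the DataFrame has one column per metric as in the cell above:

```python
# Sort by faithfulness so likely hallucinations come first
# (column names are assumed to match the metric names).
worst = df.sort_values("faithfulness").head(5)
worst[["question", "answer", "faithfulness"]]
```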
And that's it!

If you have any suggestions/feedback/things you're not happy about, please do share them in the [issue section](https://github.com/explodinggradients/ragas/issues). We love hearing from you 😁
7 changes: 7 additions & 0 deletions docs/getstarted/index.md
@@ -0,0 +1,7 @@
# Get Started

```{toctree}
:maxdepth: 1
install.md
evaluation.md
```
20 changes: 20 additions & 0 deletions docs/getstarted/install.md
@@ -0,0 +1,20 @@
# Install

You can install Ragas with:
```bash
pip install ragas
```

If you want to install the latest version (from the main branch):
```bash
pip install git+https://github.com/explodinggradients/ragas.git
```

If you are looking to contribute and make changes to the code, make sure you
clone the repo and install it as [editable
install](https://pip.pypa.io/en/stable/topics/local-project-installs/#editable-installs).
```bash
git clone https://github.com/explodinggradients/ragas.git
cd ragas
pip install -e .
```
File renamed without changes.
File renamed without changes.
45 changes: 0 additions & 45 deletions docs/guides/data_prep.py

This file was deleted.

File renamed without changes.
1 change: 1 addition & 0 deletions docs/howtos/index.md
@@ -0,0 +1 @@
# How-to Guides
13 changes: 13 additions & 0 deletions docs/index.md
@@ -0,0 +1,13 @@
# Welcome

Ragas is a framework that helps you evaluate your Retrieval Augmented Generation (RAG) pipelines. RAG denotes a class of LLM applications that use external data to augment the LLM’s context. There are existing tools and frameworks that help you build these pipelines but evaluating it and quantifying your pipeline performance can be hard. This is where Ragas (RAG Assessment) comes in.

Ragas provides you with the tools based on the latest research for evaluating LLM-generated text to give you insights about your RAG pipeline. Ragas can be integrated with your CI/CD to provide continuous checks to ensure performance.

```{toctree}
:hidden:
getstarted/index.md
concepts/index.md
howtos/index.md
references/index.md
```