Merge pull request #14 from pragunbhutani/development
feat: migrating the project to use poetry
pragunbhutani authored Apr 2, 2024
2 parents 39c939f + 9432058 commit 4f76641
Showing 31 changed files with 4,629 additions and 174 deletions.
40 changes: 40 additions & 0 deletions .github/workflows/lint-and-test.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
name: Lint and Test

on:
push:
branches:
- main
pull_request: {}

permissions:
contents: read

jobs:
lint:
name: Lint and test
runs-on: ubuntu-latest

steps:
- name: Checkout code
uses: actions/checkout@v4

- name: Install Poetry
run: pipx install poetry

- name: Setup Python
uses: actions/setup-python@v5
with:
python-version: "3.10"
cache: "poetry"

- name: Install dev dependencies
run: poetry install

- name: Lint with Flake8
run: poetry run flake8 dbt_llm_tools tests

- name: Lint with pylint
run: poetry run pylint dbt_llm_tools tests

- name: Test with pytest
run: poetry run pytest tests
51 changes: 20 additions & 31 deletions .github/workflows/python-publish.yml
@@ -1,48 +1,37 @@
# This workflow will upload a Python Package using Twine when a release is created
# For more information see: https://docs.github.com/en/actions/automating-builds-and-tests/building-and-testing-python#publishing-to-package-registries

# This workflow uses actions that are not certified by GitHub.
# They are provided by a third-party and are governed by
# separate terms of service, privacy policy, and support
# documentation.

name: Upload Python Package

on:
push:
branches: [main, staging]
release:
types:
- published

permissions:
contents: read

jobs:
deploy:
release:
name: Release to PyPI
runs-on: ubuntu-latest
environment: ${{ github.ref_name }}

steps:
- uses: actions/checkout@v4
- name: Checkout code
uses: actions/checkout@v4

- name: Install Poetry
run: pipx install poetry

- name: Set up Python
uses: actions/setup-python@v5
with:
python-version: "3.10"
- name: Install dependencies
run: |
python -m pip install --upgrade pip
python -m pip install -r requirements.txt
pip install setuptools wheel twine
- name: Lint with pylint
run: |
pylint $(git ls-files '*.py')
- name: Test with pytest
run: |
python -m unittest
- name: Build and publish
cache: "poetry"

- name: Build
run: poetry build

- name: Publish
env:
TWINE_USERNAME: __token__
TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }}
REPOSITORY_URL: ${{ vars.REPOSITORY_URL }}
PYPI_TOKEN: ${{ secrets.PYPI_TOKEN }}
run: |
python setup.py sdist bdist_wheel
echo $REPORSITORY_URL
twine upload --repository-url $REPOSITORY_URL dist/*
poetry config pypi-token.pypi $PYPI_TOKEN
poetry publish
11 changes: 8 additions & 3 deletions .gitignore
@@ -82,7 +82,9 @@ docs/_templates/
target/

# Jupyter Notebook
.ipynb_checkpoints
**/*.ipynb_checkpoints
**/*.ipynb
notebooks/*

# IPython
profile_default/
@@ -197,9 +199,12 @@ pyrightconfig.json
# End of https://www.toptal.com/developers/gitignore/api/python,visualstudiocode

### Vector Store Instances ###
chroma.db
**/**chroma.db
test_chroma.db
.database/*

### Mac
.DS_Store
.DS_Store

# Sphinx documentation
docs/_build/
8 changes: 0 additions & 8 deletions .pylintrc

This file was deleted.

22 changes: 11 additions & 11 deletions README.md
@@ -1,8 +1,8 @@
# Ragstar - LLM Tools for DBT Projects
# dbt-llm-tools

Ragstar (inspired by `RAG & select *`) is set of LLM powered tools to elevate your dbt projects and supercharge your data team.
LLM based tools for dbt projects. Answer data questions, generate documentation and more.

These tools include:
Currently Includes:

- Chatbot: ask questions about data and get answers based on your dbt model documentation
- Documentation Generator: generate documentation for dbt models based on model and upstream model definition.
@@ -11,18 +11,18 @@ These tools include:

### Installation

Ragstar can be installed via pip.
dbt-llm-tools can be installed via pip.

```
pip install ragstar
pip install dbt-llm-tools
```

## Basic Usage - Chatbot

How to load your dbt project into the Chatbot and ask questions about your data.

```Python
from ragstar import Chatbot
from dbt_llm_tools import Chatbot

# Instantiate a chatbot object
chatbot = Chatbot(
Expand All @@ -40,14 +40,14 @@ response = chatbot.ask_question(
print(response)
```

**Note**: Ragstar currently only supports OpenAI ChatGPT models for generating embeddings and responses to queries.
**Note**: dbt-llm-tools currently only supports OpenAI ChatGPT models for generating embeddings and responses to queries.

### How it works

Ragstar is based on the concept of Retrieval Augmented Generation and basically works as follows:
The Chatbot is based on the concept of Retrieval Augmented Generation and basically works as follows:

- When you call the `chatbot.load_models()` method, Ragstar scans all the folders in the locations specified by you for dbt YML files.
- It then converts all the models into a text description, which are stored as embeddings in a vector database. Ragstar currently only supports [ChromaDB](https://www.trychroma.com/) as a vector db, which is persisted in a file on your local machine.
- When you call the `chatbot.load_models()` method, the bot scans all the folders in the locations specified by you for dbt YML files.
- It then converts all the models into a text description, which are stored as embeddings in a vector database. The bot currently only supports [ChromaDB](https://www.trychroma.com/) as a vector db, which is persisted in a file on your local machine.
- When you ask a query, it fetches 3 models whose description is found to be the most relevant for your query.
- These models are then fed into ChatGPT as a prompt, along with some basic instructions and your question.
- The response is returned to you as a string.
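The retrieve-then-prompt loop described in the bullets above can be sketched end to end. This is a toy illustration under stated assumptions, not the dbt-llm-tools implementation: the embedding function is a bag-of-letters stand-in for the OpenAI embedding model, and the model descriptions and helper names (`retrieve`, `build_prompt`) are hypothetical, chosen only to show the flow (embed docs, rank by similarity, assemble a prompt).

```python
import math

def embed(text: str) -> list[float]:
    # Toy stand-in for an embedding model: a 26-dim letter-frequency vector.
    vec = [0.0] * 26
    for ch in text.lower():
        if ch.isalpha():
            vec[ord(ch) - ord("a")] += 1.0
    return vec

def cosine(a: list[float], b: list[float]) -> float:
    dot = sum(x * y for x, y in zip(a, b))
    norm = math.sqrt(sum(x * x for x in a)) * math.sqrt(sum(x * x for x in b))
    return dot / norm if norm else 0.0

# Hypothetical text descriptions, standing in for those generated from dbt YML files.
MODEL_DESCRIPTIONS = {
    "fct_orders": "One row per order, with order totals, status and order date.",
    "dim_customers": "One row per customer, with name, region and signup date.",
    "stg_web_events": "Raw clickstream events captured from the website.",
}

def retrieve(question: str, k: int = 3) -> list[str]:
    """Return the k model names whose description best matches the question."""
    q = embed(question)
    ranked = sorted(
        MODEL_DESCRIPTIONS,
        key=lambda name: cosine(q, embed(MODEL_DESCRIPTIONS[name])),
        reverse=True,
    )
    return ranked[:k]

def build_prompt(question: str) -> str:
    """Assemble the prompt: instructions, retrieved model docs, then the question."""
    context = "\n".join(f"- {m}: {MODEL_DESCRIPTIONS[m]}" for m in retrieve(question))
    return (
        "You are a data analyst working with a data warehouse.\n"
        f"The following models are available:\n{context}\n"
        f"Question: {question}"
    )

print(build_prompt("How many orders were placed last month?"))
```

In the real package the embeddings come from OpenAI and the ranked lookup is served by ChromaDB, but the shape of the loop is the same: the prompt sent to ChatGPT is instructions plus the top-matching model descriptions plus the user's question.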
@@ -57,7 +57,7 @@ Ragstar is based on the concept of Retrieval Augmented Generation and basically
How to load your dbt project into the Documentation Generator and have it write documentation for your models.

```Python
from ragstar import DocumentationGenerator
from dbt_llm_tools import DocumentationGenerator

# Instantiate a Documentation Generator object
doc_gen = DocumentationGenerator(
15 changes: 15 additions & 0 deletions dbt_llm_tools/__init__.py
@@ -0,0 +1,15 @@
from dbt_llm_tools.chatbot import Chatbot
from dbt_llm_tools.dbt_model import DbtModel
from dbt_llm_tools.dbt_project import DbtProject
from dbt_llm_tools.documentation_generator import DocumentationGenerator
from dbt_llm_tools.instructions import (
ANSWER_QUESTION_INSTRUCTIONS,
INTERPRET_MODEL_INSTRUCTIONS,
)
from dbt_llm_tools.types import (
DbtModelDict,
DbtModelDirectoryEntry,
ParsedSearchResult,
PromptMessage,
)
from dbt_llm_tools.vector_store import VectorStore
9 changes: 4 additions & 5 deletions ragstar/chatbot.py → dbt_llm_tools/chatbot.py
@@ -1,10 +1,9 @@
from openai import OpenAI

from ragstar.types import PromptMessage, ParsedSearchResult

from ragstar.instructions import ANSWER_QUESTION_INSTRUCTIONS
from ragstar.dbt_project import DbtProject
from ragstar.vector_store import VectorStore
from dbt_llm_tools.dbt_project import DbtProject
from dbt_llm_tools.instructions import ANSWER_QUESTION_INSTRUCTIONS
from dbt_llm_tools.types import ParsedSearchResult, PromptMessage
from dbt_llm_tools.vector_store import VectorStore


class Chatbot:
3 changes: 2 additions & 1 deletion ragstar/dbt_model.py → dbt_llm_tools/dbt_model.py
@@ -1,5 +1,6 @@
from typing import Callable
from ragstar.types import DbtModelDict

from dbt_llm_tools.types import DbtModelDict


class DbtModel:
9 changes: 4 additions & 5 deletions ragstar/dbt_project.py → dbt_llm_tools/dbt_project.py
@@ -1,14 +1,13 @@
import os
import glob
import re
import json

import os
import re
from typing import Union

import yaml

from ragstar.types import DbtModelDirectoryEntry, DbtProjectDirectory
from ragstar.dbt_model import DbtModel
from dbt_llm_tools.dbt_model import DbtModel
from dbt_llm_tools.types import DbtModelDirectoryEntry, DbtProjectDirectory

SOURCE_SEARCH_EXPRESSION = r"source\(['\"]*(.*?)['\"]*?\)"
REF_SEARCH_EXPRESSION = r"ref\(['\"]*(.*?)['\"]*\)"
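These two expressions drive dependency extraction from raw model SQL. A quick sketch of how they behave on a sample Jinja snippet (the SQL below is illustrative, not from the repository):

```python
import re

# Expressions as defined in dbt_project.py.
SOURCE_SEARCH_EXPRESSION = r"source\(['\"]*(.*?)['\"]*?\)"
REF_SEARCH_EXPRESSION = r"ref\(['\"]*(.*?)['\"]*\)"

sql = """
select o.*, c.region
from {{ ref('stg_orders') }} o
join {{ source('raw', 'customers') }} c on o.customer_id = c.id
"""

refs = re.findall(REF_SEARCH_EXPRESSION, sql)
sources = re.findall(SOURCE_SEARCH_EXPRESSION, sql)

print(refs)     # upstream model names referenced via ref()
print(sources)  # source() arguments, captured as a single string
```

Note that the single capture group in `SOURCE_SEARCH_EXPRESSION` returns both `source()` arguments (schema and table) in one string, inner quotes included, so downstream code would need to split that capture to recover the two names.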
@@ -1,12 +1,12 @@
import os
import json
import yaml
import os

import yaml
from openai import OpenAI

from ragstar.types import DbtModelDict, DbtModelDirectoryEntry, PromptMessage
from ragstar.instructions import INTERPRET_MODEL_INSTRUCTIONS
from ragstar.dbt_project import DbtProject
from dbt_llm_tools.dbt_project import DbtProject
from dbt_llm_tools.instructions import INTERPRET_MODEL_INSTRUCTIONS
from dbt_llm_tools.types import DbtModelDict, DbtModelDirectoryEntry, PromptMessage


class MyDumper(yaml.Dumper): # pylint: disable=too-many-ancestors
@@ -153,7 +153,7 @@ def interpret_model(self, model: DbtModelDirectoryEntry) -> DbtModelDict:
self.__get_system_prompt(
f"""
The model {model["name"]} references the following models: {", ".join(refs)}.
The model {model["name"]} references the following models: {", ".join(refs)}.
The interpretation for each of these models is as follows:
"""
)
40 changes: 21 additions & 19 deletions ragstar/instructions.py → dbt_llm_tools/instructions.py
@@ -1,41 +1,43 @@
INTERPRET_MODEL_INSTRUCTIONS = r"""
You are a data analyst trying to understand the meaning and schema of a dbt model.
You are a data analyst trying to understand the meaning and schema of a dbt model.
You will be provided with the name of the model and the Jinja SQL code that defines the model.
The Jinja files may contain references to other models, using the \{\{ ref('model_name') \}\} syntax,
or references to source tables using the \{\{ source('schema_name', 'table_name') \}\} syntax.
The interpretation for all upstream models will be provided to you in the form of a
The interpretation for all upstream models will be provided to you in the form of a
JSON object that contains the following keys: model, description, columns.
A source table is a table that is not defined in the dbt project, but is instead a table that is present in the data warehouse.
A source table is a table that is not defined in the dbt project, but is instead a table that is present in the
data warehouse.
Your response should be in the form of a JSON object that contains the following keys: model, description, columns.
The columns key should contain a list of JSON objects, each of which should contain
The columns key should contain a list of JSON objects, each of which should contain
the following keys: name, description.
Your response should only contain an unformatted JSON string described above and nothing else.
"""

ANSWER_QUESTION_INSTRUCTIONS = r"""
You are a data analyst working with a data warehouse. You should provide the user with the information
You are a data analyst working with a data warehouse. You should provide the user with the information
they need to answer their question.
You should only provide information that you are confident is correct. When you are not sure about the answer,
You should only provide information that you are confident is correct. When you are not sure about the answer,
you should let the user know.
If you are able to construct a SQL query that would answer the user's question, you should do so. However
please refrain from doing so if the user's question is ambiguous or unclear. When writing a SQL query,
you should only use column values if these values have been explicitly provided to you in the information
you have been given.
Do not write a SQL query if you are unsure about the correctness of the query or about the values contained
in the columns. Only write a SQL query if you are confident that the query is exhaustive and that it will
return the correct results. If it is not possible to write a SQL that fulfils these conditions,
you should instead respond with the names of the tables or columns that you think are relevant to the user's question.
You should also refrain from providing any information that is not directly related to the user's question or that
If you are able to construct a SQL query that would answer the user's question, you should do so. However
please refrain from doing so if the user's question is ambiguous or unclear. When writing a SQL query,
you should only use column values if these values have been explicitly provided to you in the information
you have been given.
Do not write a SQL query if you are unsure about the correctness of the query or about the values contained
in the columns. Only write a SQL query if you are confident that the query is exhaustive and that it will
return the correct results. If it is not possible to write a SQL that fulfils these conditions,
you should instead respond with the names of the tables or columns that you think are relevant to the user's
question.
You should also refrain from providing any information that is not directly related to the user's question or that
which cannot be inferred from the information you have been given.
The following information about tables and columns is available to you:
1 change: 1 addition & 0 deletions ragstar/types.py → dbt_llm_tools/types.py
@@ -1,4 +1,5 @@
from typing import TypedDict, Union

from typing_extensions import NotRequired


4 changes: 2 additions & 2 deletions ragstar/vector_store.py → dbt_llm_tools/vector_store.py
@@ -3,8 +3,8 @@
import chromadb
from chromadb.utils import embedding_functions

from ragstar.types import ParsedSearchResult
from ragstar.dbt_model import DbtModel
from dbt_llm_tools.dbt_model import DbtModel
from dbt_llm_tools.types import ParsedSearchResult


class VectorStore:
