Skip to content

Commit

Permalink
Finalize defichain python integration
Browse files Browse the repository at this point in the history
  • Loading branch information
0ptim committed Oct 15, 2023
1 parent 7314660 commit 7ced8f0
Show file tree
Hide file tree
Showing 8 changed files with 105 additions and 22 deletions.
27 changes: 27 additions & 0 deletions .github/workflows/defichain_python_scraping_production.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
# Manually-triggered workflow that scrapes the DefichainPython docs and
# writes OpenAI embeddings into the PRODUCTION Supabase instance.
name: Defichain Python scraping Production

on:
  # Production runs are deliberate: manual dispatch only, no automatic trigger.
  workflow_dispatch:

jobs:
  run_script:
    runs-on: ubuntu-latest
    steps:
      # NOTE(review): checkout@v2 / setup-python@v2 are deprecated upstream;
      # consider bumping to @v4 / @v5 once verified against this repo.
      - uses: actions/checkout@v2
      - name: Set up Python
        uses: actions/setup-python@v2
        with:
          # Quoted so YAML treats it as a string, not the float 3.8
          # (unquoted 3.10 would be parsed as 3.1).
          python-version: "3.8"

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt
        # requirements.txt lives next to the script, not at the repo root.
        working-directory: ./job

      - name: Run the script
        run: python ./job/defichainpython_embedding.py
        env:
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          SUPABASE_URL: ${{ vars.PRODUCTION_SUPABASE_API_URL }}
          SUPABASE_KEY: ${{ secrets.PRODUCTION_SUPABASE_API_ANON_KEY }}
30 changes: 30 additions & 0 deletions .github/workflows/defichain_python_scraping_staging.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
# Workflow that scrapes the DefichainPython docs and writes OpenAI embeddings
# into the STAGING Supabase instance. Runs on PRs into main (smoke test of the
# embedding pipeline) and can also be triggered manually.
name: Defichain Python scraping Staging

on:
  pull_request:
    branches:
      - main
  workflow_dispatch:

jobs:
  run_script:
    runs-on: ubuntu-latest
    steps:
      # NOTE(review): checkout@v2 / setup-python@v2 are deprecated upstream;
      # consider bumping to @v4 / @v5 once verified against this repo.
      - uses: actions/checkout@v2
      - name: Set up Python
        uses: actions/setup-python@v2
        with:
          # Quoted so YAML treats it as a string, not the float 3.8
          # (unquoted 3.10 would be parsed as 3.1).
          python-version: "3.8"

      - name: Install dependencies
        run: |
          python -m pip install --upgrade pip
          pip install -r requirements.txt
        # requirements.txt lives next to the script, not at the repo root.
        working-directory: ./job

      - name: Embeddings for DefichainPython
        run: python ./job/defichainpython_embedding.py
        env:
          OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
          SUPABASE_URL: ${{ vars.STAGING_SUPABASE_API_URL }}
          SUPABASE_KEY: ${{ secrets.STAGING_SUPABASE_API_ANON_KEY }}
2 changes: 1 addition & 1 deletion .github/workflows/wiki_scraping_production.yml
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ jobs:
working-directory: ./job

- name: Run the script
run: python ./job/app.py
run: python ./job/wiki_embedding.py
env:
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
SUPABASE_URL: ${{ vars.PRODUCTION_SUPABASE_API_URL }}
Expand Down
7 changes: 0 additions & 7 deletions .github/workflows/wiki_scraping_staging.yml
Original file line number Diff line number Diff line change
Expand Up @@ -28,10 +28,3 @@ jobs:
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
SUPABASE_URL: ${{ vars.STAGING_SUPABASE_API_URL }}
SUPABASE_KEY: ${{ secrets.STAGING_SUPABASE_API_ANON_KEY }}

- name: Embeddings for DefichainPython
run: python ./job/defichainpython_embedding.py
env:
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
SUPABASE_URL: ${{ vars.STAGING_SUPABASE_API_URL }}
SUPABASE_KEY: ${{ secrets.STAGING_SUPABASE_API_ANON_KEY }}
7 changes: 4 additions & 3 deletions backend/tools/defichainpython_qa.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,8 @@
# Set debug to True to see A LOT of details of langchain's inner workings
# langchain.debug = True

# The name of the table in Supabase, where the vectors are stored
vectorTableName = "embeddings"
# The name of the function in Supabase which is used to match the embeddings
matchVectorFunctionName = "match_embeddings_defichain_python"

# Create the supabase client
SUPABASE_URL = os.getenv("SUPABASE_URL")
Expand Down Expand Up @@ -61,7 +61,7 @@ def get_answer(question: str) -> str:
try:
vectors = OpenAIEmbeddings().embed_documents([question])
embeddings = supabase.rpc(
"match_embeddings", dict(query_embedding=vectors[0], match_count=7)
matchVectorFunctionName, dict(query_embedding=vectors[0], match_count=7)
).execute()

print(f"⚡ Retrieved {len(embeddings.data)} vectors from Supabase:")
Expand Down Expand Up @@ -93,6 +93,7 @@ def get_answer(question: str) -> str:
args_schema=ToolInputSchema,
)


if __name__ == "__main__":
while True:
question = input(
Expand Down
11 changes: 5 additions & 6 deletions backend/tools/wiki_qa.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,14 +15,13 @@
)
import langchain


load_dotenv()

# Set debug to True to see A LOT of details of langchain's inner workings
# langchain.debug = True

# The name of the table in Supabase, where the vectors are stored
vectorTableName = "embeddings"
matchVectorFunctionName = "match_embeddings"

# Create the supabase client
SUPABASE_URL = os.getenv("SUPABASE_URL")
Expand All @@ -42,12 +41,12 @@ class KnowledgeAnswer(BaseModel):
)


llm = ChatOpenAI(model_name="gpt-3.5-turbo-16k", temperature=0.7)
llm = ChatOpenAI(model_name="gpt-3.5-turbo-16k", temperature=0.3)

prompt_msgs = [
SystemMessagePromptTemplate.from_template(
"""You're an elite algorithm, answering queries based solely on given context. If the context lacks the answer, state ignorance.
"""You're an elite algorithm, answering queries based solely on given context. If the context lacks the answer, state ignorance. If you are not 100% sure tell the user.
Context:
{context}"""
),
Expand All @@ -62,7 +61,7 @@ def get_answer(question: str) -> str:
try:
vectors = OpenAIEmbeddings().embed_documents([question])
embeddings = supabase.rpc(
"match_embeddings", dict(query_embedding=vectors[0], match_count=7)
matchVectorFunctionName, dict(query_embedding=vectors[0], match_count=7)
).execute()

print(f"⚡ Retrieved {len(embeddings.data)} vectors from Supabase:")
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
-- Create a table to store embeddings
-- Schema mirrors the LangChain/Supabase vector-store convention: one row per
-- document chunk, with the raw text, its metadata, and its embedding vector.
create table embeddings_defichain_python (
id UUID primary key,
content text, -- corresponds to Document.pageContent
metadata jsonb, -- corresponds to Document.metadata
embedding vector(1536) -- 1536 works for OpenAI embeddings, change if needed
);

-- Create a function to search for embeddings
-- Returns the match_count rows most similar to query_embedding, optionally
-- restricted to rows whose metadata contains the given `filter` object.
-- A null match_count places no LIMIT (all matching rows are returned).
create function match_embeddings_defichain_python (
query_embedding vector(1536),
match_count int default null,
filter jsonb DEFAULT '{}'
) returns table (
id uuid,
content text,
metadata jsonb,
similarity float
)
language plpgsql
as $$
-- Unqualified names in the query below (id, content, ...) collide with the
-- OUT parameters of the RETURNS TABLE clause; resolve them to the table
-- columns rather than the plpgsql variables.
#variable_conflict use_column
begin
return query
select
id,
content,
metadata,
-- `<=>` is pgvector's cosine-distance operator; 1 - distance converts it
-- into a similarity score where higher means more similar.
1 - (embeddings_defichain_python.embedding <=> query_embedding) as similarity
from embeddings_defichain_python
-- jsonb containment: row's metadata must include every key/value in filter;
-- the default '{}' matches all rows.
where metadata @> filter
order by embeddings_defichain_python.embedding <=> query_embedding
limit match_count;
end;
$$;
8 changes: 3 additions & 5 deletions job/defichainpython_embedding.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@

load_dotenv()

vectorTableName = "embeddings"
vectorTableName = "embeddings_defichain_python"
scrapeUrls = ["https://docs.defichain-python.de/build/html/sitemap.xml"]
embedding_model = "text-embedding-ada-002"

Expand All @@ -29,9 +29,7 @@
print("🔎 Found %s unique pages" % len(urls))

# Remove urls
remove_urls = (
"https://docs.defichain-python.de/build/html/search.html"
)
remove_urls = "https://docs.defichain-python.de/build/html/search.html"

urls = [url for url in urls if url not in remove_urls]

Expand Down Expand Up @@ -60,7 +58,7 @@

# Split the documents in chunks for upload (Did time out when too large).
docs_chunks = [
docs[x: x + upload_chunk_size] for x in range(0, len(docs), upload_chunk_size)
docs[x : x + upload_chunk_size] for x in range(0, len(docs), upload_chunk_size)
]

# Iterate over each chunk and upload separately.
Expand Down

0 comments on commit 7ced8f0

Please sign in to comment.