Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(upload): changed to task #1178

Merged
merged 1 commit into from
Sep 15, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 19 additions & 0 deletions backend/celery_task.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
from celery import shared_task
from models.brains import Brain
from repository.files.upload_file import DocumentSerializable
from utils.vectors import Neurons


@shared_task
def create_embedding_for_document(
    brain_id, doc_with_metadata, user_openai_api_key, file_sha1
):
    """Celery task: embed a serialized document and attach the vector to a brain.

    Args:
        brain_id: Identifier of the brain the new vector belongs to.
        doc_with_metadata: JSON string produced by
            ``DocumentSerializable.to_json()`` (documents must cross the
            broker as plain strings, not rich objects).
        user_openai_api_key: OpenAI API key used when computing the embedding.
        file_sha1: SHA-1 of the source file, stored alongside the vector so
            the brain can map vectors back to files.
    """
    neurons = Neurons()
    doc = DocumentSerializable.from_json(doc_with_metadata)
    created_vector = neurons.create_vector(doc, user_openai_api_key)

    # create_vector returns a sequence of ids; only the first is needed here.
    created_vector_id = created_vector[0]  # pyright: ignore reportPrivateUsage=none

    brain = Brain(id=brain_id)
    brain.create_brain_vector(created_vector_id, file_sha1)
21 changes: 9 additions & 12 deletions backend/parsers/common.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
import time

from langchain.schema import Document
from models import Brain, File
from utils.vectors import Neurons
from celery_task import create_embedding_for_document
from models import File
from repository.files.upload_file import DocumentSerializable


async def process_file(
Expand All @@ -26,15 +26,12 @@ async def process_file(
"date": dateshort,
"summarization": "true" if enable_summarization else "false",
}
doc_with_metadata = Document(page_content=doc.page_content, metadata=metadata)
doc_with_metadata = DocumentSerializable(
page_content=doc.page_content, metadata=metadata
)

neurons = Neurons()
created_vector = neurons.create_vector(doc_with_metadata, user_openai_api_key)
# add_usage(stats_db, "embedding", "audio", metadata={"file_name": file_meta_name,"file_type": ".txt", "chunk_size": chunk_size, "chunk_overlap": chunk_overlap})

created_vector_id = created_vector[0] # pyright: ignore reportPrivateUsage=none

brain = Brain(id=brain_id)
brain.create_brain_vector(created_vector_id, file.file_sha1)
create_embedding_for_document.delay(
brain_id, doc_with_metadata.to_json(), user_openai_api_key, file.file_sha1
)

return
35 changes: 35 additions & 0 deletions backend/repository/files/upload_file.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,9 @@
import json
from multiprocessing import get_logger

from httpx import Response
from langchain.pydantic_v1 import Field
from langchain.schema import Document
from models import get_supabase_client
from supabase.client import Client

Expand All @@ -19,3 +22,35 @@ def upload_file_storage(file, file_identifier: str) -> Response:
logger.error(e)
print(e)
return response


class DocumentSerializable(Document):
    """A langchain ``Document`` that can round-trip through JSON.

    Needed so documents can be handed to Celery tasks, which require
    broker-safe (string) payloads rather than rich Python objects.
    """

    page_content: str
    metadata: dict = Field(default_factory=dict)

    @property
    def lc_serializable(self) -> bool:
        # Advertise to langchain that this class supports serialization.
        return True

    def __repr__(self):
        preview = self.page_content[:50]
        return f"Document(page_content='{preview}...', metadata={self.metadata})"

    def __str__(self):
        return repr(self)

    def to_json(self) -> str:
        """Serialize this document (content + metadata) to a JSON string."""
        payload = {
            "page_content": self.page_content,
            "metadata": self.metadata,
        }
        return json.dumps(payload)

    @classmethod
    def from_json(cls, json_str: str):
        """Rebuild a document from a JSON string produced by ``to_json``."""
        fields = json.loads(json_str)
        return cls(
            page_content=fields["page_content"],
            metadata=fields["metadata"],
        )