-
Notifications
You must be signed in to change notification settings - Fork 2.1k
feat: Knowledge base generation problem #2760
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -23,6 +23,7 @@ | |
from django.core import validators | ||
from django.db import transaction, models | ||
from django.db.models import QuerySet | ||
from django.db.models.functions import Reverse, Substr | ||
from django.http import HttpResponse | ||
from drf_yasg import openapi | ||
from rest_framework import serializers | ||
|
@@ -42,9 +43,10 @@ | |
from dataset.models.data_set import DataSet, Document, Paragraph, Problem, Type, ProblemParagraphMapping, TaskType, \ | ||
State, File, Image | ||
from dataset.serializers.common_serializers import list_paragraph, MetaSerializer, ProblemParagraphManage, \ | ||
get_embedding_model_by_dataset_id, get_embedding_model_id_by_dataset_id, write_image, zip_dir | ||
get_embedding_model_by_dataset_id, get_embedding_model_id_by_dataset_id, write_image, zip_dir, \ | ||
GenerateRelatedSerializer | ||
from dataset.serializers.document_serializers import DocumentSerializers, DocumentInstanceSerializer | ||
from dataset.task import sync_web_dataset, sync_replace_web_dataset | ||
from dataset.task import sync_web_dataset, sync_replace_web_dataset, generate_related_by_dataset_id | ||
from embedding.models import SearchMode | ||
from embedding.task import embedding_by_dataset, delete_embedding_by_dataset | ||
from setting.models import AuthOperate, Model | ||
|
@@ -814,6 +816,31 @@ def re_embedding(self, with_valid=True): | |
except AlreadyQueued as e: | ||
raise AppApiException(500, _('Failed to send the vectorization task, please try again later!')) | ||
|
||
def generate_related(self, instance: Dict, with_valid=True): | ||
if with_valid: | ||
self.is_valid(raise_exception=True) | ||
GenerateRelatedSerializer(data=instance).is_valid(raise_exception=True) | ||
dataset_id = self.data.get('id') | ||
model_id = instance.get("model_id") | ||
prompt = instance.get("prompt") | ||
state_list = instance.get('state_list') | ||
ListenerManagement.update_status(QuerySet(Document).filter(dataset_id=dataset_id), | ||
TaskType.GENERATE_PROBLEM, | ||
State.PENDING) | ||
ListenerManagement.update_status(QuerySet(Paragraph).annotate( | ||
reversed_status=Reverse('status'), | ||
task_type_status=Substr('reversed_status', TaskType.GENERATE_PROBLEM.value, | ||
1), | ||
).filter(task_type_status__in=state_list, dataset_id=dataset_id) | ||
.values('id'), | ||
TaskType.GENERATE_PROBLEM, | ||
State.PENDING) | ||
ListenerManagement.get_aggregation_document_status_by_dataset_id(dataset_id)() | ||
try: | ||
generate_related_by_dataset_id.delay(dataset_id, model_id, prompt, state_list) | ||
except AlreadyQueued as e: | ||
raise AppApiException(500, _('Failed to send the vectorization task, please try again later!')) | ||
|
||
def list_application(self, with_valid=True): | ||
if with_valid: | ||
self.is_valid(raise_exception=True) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The code you provided contains some general improvements and optimizations that can be made:
Here’s an improved version of the relevant section of code with these suggestions: from django.core import validators
from django.db import transaction, models
from django.db.models import QuerySet
from django.db.models.functions import Reverse, Substr
from drf_yasg import openapi
from rest_framework import serializers
from dataset.models import Dataset, Document, Paragraph, Problem, Type, ProblemParagraphMapping, TaskType, \
State, File, Image
from dataset.serializers.common_serializers import list_paragraph, MetaSerializer, ProblemParagraphManage, \
get_embedding_model_by_dataset_id, get_embedding_model_id_by_dataset_id, write_image, zip_dir
from dataset.serializers.document_serializers import DocumentSerializers, DocumentInstanceSerializer
from .dataset.task import sync_web_dataset, sync_replace_web_dataset, generate_related_by_dataset_id
from embedding.models import SearchMode
from embedding.task import embedding_by_dataset, delete_embedding_by_dataset
from setting.models import AuthOperate, Model
class YourClassName(serializers.Serializer):
data = serializers.DictField(required=True)
class Meta:
fields = ('data')
def re_embedding(self, with_valid=True):
if with_valid:
self.is_valid(raise_exception=True)
dataset_id = self.data['id']
try:
# Your existing implementation here
except AlreadyQueued as e:
raise AppApiException(500, _('Failed to send the vectorization task, please try again later!'))
def generate_related(self, with_valid=True):
if with_valid:
self.is_valid(raise_exception=True)
GenerateRelatedSerializer.data = self.validated_data
dataset_id = self.validated_data.get('id')
model_id = self.validated_data.get("model_id")
prompt = self.validated_data.get("prompt")
state_list = [s.lower() for s in self.validated_data.get('state_list')]
ListenerManagement.update_status(
queryset=Document.objects.filter(dataset_id=dataset_id),
task_type=TaskType.GENERATE_PROBLEM,
new_state=State.PENDING
)
ListenerManagement.update_status(
queryset=(
Paragraph.objects.annotate(
status_reverse=Reverse('status'),
task_type_status=Substr('status_reverse', TaskType.GENERATE_PROBLEM.value, 1)
)
.filter(task_type_status='pending', dataset_id=dataset_id)
.values('id')
),
task_type=TaskType.GENERATE_PROBLEM,
new_state=State.PENDING
)
ListenerManagement.get_aggregation_document_status_by_dataset_id(dataset_id)()
try:
generate_related_by_dataset_id.delay(dataset_id, model_id, prompt, tuple(state_list))
except AlreadyQueued as e:
raise AppApiException(500, _('Failed to send the vectorization task, please try again later!'))
def list_application(self, with_valid=True):
if with_valid:
self.is_valid(raise_exception=True) Make sure to adapt the class name ( |
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -21,6 +21,7 @@ | |
from common.response import result | ||
from common.response.result import get_page_request_params, get_page_api_response, get_api_response | ||
from common.swagger_api.common_api import CommonApi | ||
from dataset.serializers.common_serializers import GenerateRelatedSerializer | ||
from dataset.serializers.dataset_serializers import DataSetSerializers | ||
from dataset.views.common import get_dataset_operation_object | ||
from setting.serializers.provider_serializers import ModelSerializer | ||
|
@@ -173,6 +174,23 @@ def put(self, request: Request, dataset_id: str): | |
return result.success( | ||
DataSetSerializers.Operate(data={'id': dataset_id, 'user_id': request.user.id}).re_embedding()) | ||
|
||
class GenerateRelated(APIView): | ||
authentication_classes = [TokenAuth] | ||
|
||
@action(methods=['PUT'], detail=False) | ||
@swagger_auto_schema(operation_summary=_('Generate related'), operation_id=_('Generate related'), | ||
manual_parameters=DataSetSerializers.Operate.get_request_params_api(), | ||
request_body=GenerateRelatedSerializer.get_request_body_api(), | ||
tags=[_('Knowledge Base')] | ||
) | ||
@log(menu='document', operate="Generate related documents", | ||
get_operation_object=lambda r, keywords: get_dataset_operation_object(keywords.get('dataset_id')) | ||
) | ||
def put(self, request: Request, dataset_id: str): | ||
return result.success( | ||
DataSetSerializers.Operate(data={'id': dataset_id, 'user_id': request.user.id}).generate_related( | ||
request.data)) | ||
|
||
class Export(APIView): | ||
authentication_classes = [TokenAuth] | ||
|
||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The provided code is already well-structured and does not seem to have any obvious issues. However, there are a few minor suggestions for optimization:
Here's an updated version of the code with some of these improvements: from django.http import HttpResponse
from rest_framework.response import Response
from django.utils.decorators import method_decorator
# Import necessary modules here for improved readability
class DataSetView(APIView):
# ... (rest of the codes remains unchanged)
class GenerateRelated(APIView):
authentication_classes = [TokenAuth]
@action(methods=['PUT'], detail=False)
@swagger_auto_schema(
operation_summary=_('Generate Related Documents'),
operation_id=_('Generate Related Documents'),
manual_parameters=[DataSetSerializers.Operate.get_request_params_api()],
request_body=GenerateRelatedSerializer.get_request_body_api(),
tags=[_('Knowledge Base')]
)
@log(menu='document', operate="Generate related documents",
get_operation_object=lambda r, keywords: get_dataset_operation_object(keywords.get('dataset_id')))
def put(self, request: Request, dataset_id: str) -> HttpResponse:
data = {
'id': dataset_id,
'user_id': request.user.id
}
try:
return response.success(DataSetSerializers.Execute(data=data).generate_related(request.data))
except Exception as e:
logger.error(f"Failed to generate related documents for dataset {dataset_id}: {e}")
return HttpResponse("An error occurred while generating related documents.", status=500)
class Export(APIView):
authentication_classes = [TokenAuth]
# ... (rest of the codes remains unchanged) These changes aim to make the code cleaner and potentially easier to maintain or update in the future. Let me know if further assistance is needed! |
||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The provided code seems to be well-written and follows Python best practices. However, there are a few minor improvements that can be made:
Type Hinting: The use of `List` from the `typing` module is recommended for better readability and clarity.
Error Messages: Ensure that the error messages are translated properly using `_` (gettext), as they might need to be checked against your gettext translation files.
API Documentation: The `generate_related_serializer.get_request_body_api()` method returns an OpenAPI schema, which is useful for API documentation but does not directly affect the logic. It should be documented within comments if necessary.
# Returns an OpenAPI schema representation of the request body for generate-related requests.
Here's the updated version with some additional comments:
Summary of Recommendations:
`List[str]` instead of just `List`.