Skip to content

Commit 2808679

Browse files
committed
refactor(artifact): simplify chunk retrieval and align chunk type naming with protobuf (#270)
Because

- The `GetChunksByFile` function was returning an unnecessary `map[types.TextChunkUIDType]string` that duplicated data already available through parallel arrays (`textChunks` and `texts`), adding complexity and memory overhead
- The codebase was using the legacy string value `"chunk"` for content chunks, which was inconsistent with the protobuf enum rename from `TYPE_CHUNK` to `TYPE_CONTENT`
- Database records stored `chunk_type = "chunk"` instead of `"content"`, causing a mismatch between the API layer (using `TYPE_CONTENT`) and the persistence layer
- Comments and documentation still referenced the old "chunk" terminology instead of "content"

This commit

**Simplifies chunk retrieval:**

- Removes the unnecessary `map[types.TextChunkUIDType]string` return value from `GetChunksByFile` service method
- Updates `GetFileCatalogResponse` handler to use direct array indexing (`texts[i]`) instead of map lookup (`chunkUIDToContent[chunk.UID]`)
- Simplifies the service interface signature from 6 return values to 5
- Improves performance by eliminating map allocation and lookup overhead

**Aligns chunk type naming across all layers:**

- Updates database migration 032 to include data migration: `UPDATE text_chunk/embedding SET chunk_type = 'content' WHERE chunk_type = 'chunk'`
- Updates all enum-to-string conversions in worker activities (`SaveTextChunksActivity`, `ProcessContentActivity`) to use `"content"` instead of `"chunk"`
- Updates all string-to-enum conversions in handlers (`convertToProtoChunk`) and services (`SimilarityChunksSearch`) to map `"content"` to `TYPE_CONTENT`
- Updates sorting logic in `ListChunks` handler to use `"content"` priority and correct field reference (`ChunkType` instead of `ContentType`)
- Updates 20+ test cases across `process_file_activity_test.go`, `embed_activity_test.go`, and `embed_workflow_test.go`
- Updates documentation comments in repository models to reflect "content/summary/augmented" terminology
1 parent 89d528f commit 2808679

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

64 files changed

+5931
-2778
lines changed

Makefile

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -98,7 +98,10 @@ integration-test: ## Run integration test
9898
integration-test/rest.js --no-usage-report
9999
@TEST_FOLDER_ABS_PATH=${PWD} k6 run \
100100
-e API_GATEWAY_PROTOCOL=${API_GATEWAY_PROTOCOL} -e API_GATEWAY_URL=${API_GATEWAY_URL} \
101-
integration-test/file-type.js --no-usage-report
101+
integration-test/rest-file-type.js --no-usage-report
102+
@TEST_FOLDER_ABS_PATH=${PWD} k6 run \
103+
-e API_GATEWAY_PROTOCOL=${API_GATEWAY_PROTOCOL} -e API_GATEWAY_URL=${API_GATEWAY_URL} \
104+
integration-test/rest-db.js --no-usage-report
102105

103106
.PHONY: help
104107
help: ## Show this help

cmd/worker/main.go

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -198,10 +198,8 @@ func main() {
198198
w.RegisterActivity(cw.ProcessSummaryActivity) // Complete summary processing: generate → save to DB
199199

200200
// Chunking Phase - Combined content and summary chunking (sequential after parallel AI operations)
201-
w.RegisterActivity(cw.GetConvertedFileForChunkingActivity) // Retrieve converted markdown for chunking
202-
w.RegisterActivity(cw.ChunkContentActivity) // Split markdown content into semantic chunks
203-
w.RegisterActivity(cw.SaveTextChunksActivity) // Persist chunks to database and MinIO storage
204-
w.RegisterActivity(cw.UpdateChunkingMetadataActivity) // Update file status and chunking metadata
201+
w.RegisterActivity(cw.ChunkContentActivity) // Split markdown content into semantic chunks
202+
w.RegisterActivity(cw.SaveTextChunksActivity) // Persist chunks to database and MinIO storage
205203

206204
// Embedding Phase - Vector embedding generation and storage
207205
w.RegisterActivity(cw.GetChunksForEmbeddingActivity) // Retrieve text chunks for embedding

go.mod

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,7 @@ require (
1313
github.com/google/go-cmp v0.7.0
1414
github.com/grpc-ecosystem/go-grpc-middleware v1.4.0
1515
github.com/grpc-ecosystem/grpc-gateway/v2 v2.27.1
16-
github.com/instill-ai/protogen-go v0.3.3-alpha.0.20251015172110-88f0bc59c769
16+
github.com/instill-ai/protogen-go v0.3.3-alpha.0.20251016131944-fbf46a3f350f
1717
github.com/instill-ai/usage-client v0.4.0
1818
github.com/instill-ai/x v0.10.0-alpha
1919
github.com/knadh/koanf v1.5.0

go.sum

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -301,8 +301,8 @@ github.com/hpcloud/tail v1.0.0/go.mod h1:ab1qPbhIpdTxEkNHXyeSf5vhxWSCs/tWer42PpO
301301
github.com/hydrogen18/memlistener v0.0.0-20200120041712-dcc25e7acd91/go.mod h1:qEIFzExnS6016fRpRfxrExeVn2gbClQA99gQhnIcdhE=
302302
github.com/imkira/go-interpol v1.1.0/go.mod h1:z0h2/2T3XF8kyEPpRgJ3kmNv+C43p+I/CoI+jC3w2iA=
303303
github.com/inconshreveable/mousetrap v1.0.0/go.mod h1:PxqpIevigyE2G7u3NXJIT2ANytuPF1OarO4DADm73n8=
304-
github.com/instill-ai/protogen-go v0.3.3-alpha.0.20251015172110-88f0bc59c769 h1:Q0LZdwBaNnPADWtgQMzU/2CgoeHdSU8HoVtxs0Cx99k=
305-
github.com/instill-ai/protogen-go v0.3.3-alpha.0.20251015172110-88f0bc59c769/go.mod h1:bCnBosofpaUxKBuTTJM3/I3thAK37kvfBnKByjnLsl4=
304+
github.com/instill-ai/protogen-go v0.3.3-alpha.0.20251016131944-fbf46a3f350f h1:VZKFHipeJ3Vz9lpub0MIkLQpN0K5gCqFh8f31OJh4I0=
305+
github.com/instill-ai/protogen-go v0.3.3-alpha.0.20251016131944-fbf46a3f350f/go.mod h1:bCnBosofpaUxKBuTTJM3/I3thAK37kvfBnKByjnLsl4=
306306
github.com/instill-ai/usage-client v0.4.0 h1:xf1hAlO4a8lZwZzz9bprZOJqU3ghIcIsavUUB7UURyg=
307307
github.com/instill-ai/usage-client v0.4.0/go.mod h1:zZ9LRoXps2u63ARYPAbR2YvqTib3dWJLObZn+9YqhF0=
308308
github.com/instill-ai/x v0.10.0-alpha h1:I83WJc+21J+IgI4aJDn755ON/BX4cDvKCVVguI77r14=

integration-test/const.js

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -102,19 +102,19 @@ export const sampleXlsx = encoding.b64encode(
102102

103103
// Mapping of sample files to their intended artifact file types
104104
export const sampleFiles = [
105-
{ originalName: "sample.txt", type: "FILE_TYPE_TEXT", content: sampleTxt },
106-
{ originalName: "sample.md", type: "FILE_TYPE_MARKDOWN", content: sampleMd },
107-
{ originalName: "sample.csv", type: "FILE_TYPE_CSV", content: sampleCsv },
108-
{ originalName: "sample.html", type: "FILE_TYPE_HTML", content: sampleHtml },
109-
{ originalName: "sample.pdf", type: "FILE_TYPE_PDF", content: samplePdf },
110-
{ originalName: "sample.ppt", type: "FILE_TYPE_PPT", content: samplePpt },
111-
{ originalName: "sample.pptx", type: "FILE_TYPE_PPTX", content: samplePptx },
112-
{ originalName: "sample.xls", type: "FILE_TYPE_XLS", content: sampleXls },
113-
{ originalName: "sample.xlsx", type: "FILE_TYPE_XLSX", content: sampleXlsx },
114-
{ originalName: "sample.doc", type: "FILE_TYPE_DOC", content: sampleDoc },
115-
{ originalName: "sample.docx", type: "FILE_TYPE_DOCX", content: sampleDocx },
116-
{ originalName: "SAMPLE-UPPERCASE-FILENAME.DOC", type: "FILE_TYPE_DOC", content: sampleUppercaseDoc },
117-
{ originalName: "SAMPLE-UPPERCASE-FILENAME.DOCX", type: "FILE_TYPE_DOCX", content: sampleUppercaseDocx },
105+
{ originalName: "sample.txt", type: "TYPE_TEXT", content: sampleTxt },
106+
{ originalName: "sample.md", type: "TYPE_MARKDOWN", content: sampleMd },
107+
{ originalName: "sample.csv", type: "TYPE_CSV", content: sampleCsv },
108+
{ originalName: "sample.html", type: "TYPE_HTML", content: sampleHtml },
109+
{ originalName: "sample.pdf", type: "TYPE_PDF", content: samplePdf },
110+
{ originalName: "sample.ppt", type: "TYPE_PPT", content: samplePpt },
111+
{ originalName: "sample.pptx", type: "TYPE_PPTX", content: samplePptx },
112+
{ originalName: "sample.xls", type: "TYPE_XLS", content: sampleXls },
113+
{ originalName: "sample.xlsx", type: "TYPE_XLSX", content: sampleXlsx },
114+
{ originalName: "sample.doc", type: "TYPE_DOC", content: sampleDoc },
115+
{ originalName: "sample.docx", type: "TYPE_DOCX", content: sampleDocx },
116+
{ originalName: "SAMPLE-UPPERCASE-FILENAME.DOC", type: "TYPE_DOC", content: sampleUppercaseDoc },
117+
{ originalName: "SAMPLE-UPPERCASE-FILENAME.DOCX", type: "TYPE_DOCX", content: sampleUppercaseDocx },
118118
];
119119

120120
let dbHost = 'localhost';

integration-test/grpc-public-with-jwt.js

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ export function CheckUploadCatalogFile(client, data) {
1414
const cRes = client.invoke("artifact.artifact.v1alpha.ArtifactPublicService/CreateCatalog", { namespaceId: data.expectedOwner.id, name: constant.dbIDPrefix + randomString(10), description: randomString(30), tags: ["test", "integration", "grpc"], type: "CATALOG_TYPE_PERSISTENT" }, data.metadata);
1515
const catalog = cRes.message.catalog;
1616

17-
const reqBody = { namespaceId: data.expectedOwner.id, catalogId: catalog.catalogId, file: { name: constant.dbIDPrefix + "test-file-grpc-jwt-" + randomString(5) + ".docx", type: "FILE_TYPE_DOCX", content: constant.sampleDocx } };
17+
const reqBody = { namespaceId: data.expectedOwner.id, catalogId: catalog.catalogId, file: { name: constant.dbIDPrefix + "test-file-grpc-jwt-" + randomString(5) + ".docx", type: "TYPE_DOCX", content: constant.sampleDocx } };
1818
// Invoke with invalid Authorization metadata → expect Unauthenticated/PermissionDenied
1919
const resNeg = client.invoke("artifact.artifact.v1alpha.ArtifactPublicService/UploadCatalogFile", reqBody, constant.paramsGRPCWithJwt);
2020
check(resNeg, {
@@ -35,7 +35,7 @@ export function CheckListCatalogFiles(client, data) {
3535
// Create resources with authorized metadata
3636
const cRes = client.invoke("artifact.artifact.v1alpha.ArtifactPublicService/CreateCatalog", { namespaceId: data.expectedOwner.id, name: constant.dbIDPrefix + randomString(10), description: randomString(30), tags: ["test", "integration", "grpc"], type: "CATALOG_TYPE_PERSISTENT" }, data.metadata);
3737
const catalog = cRes.message.catalog;
38-
client.invoke("artifact.artifact.v1alpha.ArtifactPublicService/UploadCatalogFile", { namespaceId: data.expectedOwner.id, catalogId: catalog.catalogId, file: { name: constant.dbIDPrefix + "test-file-grpc-jwt-" + randomString(5) + ".docx", type: "FILE_TYPE_DOCX", content: constant.sampleDocx } }, data.metadata);
38+
client.invoke("artifact.artifact.v1alpha.ArtifactPublicService/UploadCatalogFile", { namespaceId: data.expectedOwner.id, catalogId: catalog.catalogId, file: { name: constant.dbIDPrefix + "test-file-grpc-jwt-" + randomString(5) + ".docx", type: "TYPE_DOCX", content: constant.sampleDocx } }, data.metadata);
3939

4040
// Negative: list with invalid Authorization
4141
const resNeg = client.invoke("artifact.artifact.v1alpha.ArtifactPublicService/ListCatalogFiles", { namespaceId: data.expectedOwner.id, catalogId: catalog.catalogId, pageSize: 10 }, constant.paramsGRPCWithJwt);
@@ -54,7 +54,7 @@ export function CheckGetCatalogFile(client, data) {
5454

5555
const cRes = client.invoke("artifact.artifact.v1alpha.ArtifactPublicService/CreateCatalog", { namespaceId: data.expectedOwner.id, name: constant.dbIDPrefix + randomString(10), description: randomString(30), tags: ["test", "integration", "grpc"], type: "CATALOG_TYPE_PERSISTENT" }, data.metadata);
5656
const catalog = cRes.message.catalog;
57-
const fRes = client.invoke("artifact.artifact.v1alpha.ArtifactPublicService/UploadCatalogFile", { namespaceId: data.expectedOwner.id, catalogId: catalog.catalogId, file: { name: constant.dbIDPrefix + "test-file-grpc-jwt-" + randomString(5) + ".docx", type: "FILE_TYPE_DOCX", content: constant.sampleDocx } }, data.metadata);
57+
const fRes = client.invoke("artifact.artifact.v1alpha.ArtifactPublicService/UploadCatalogFile", { namespaceId: data.expectedOwner.id, catalogId: catalog.catalogId, file: { name: constant.dbIDPrefix + "test-file-grpc-jwt-" + randomString(5) + ".docx", type: "TYPE_DOCX", content: constant.sampleDocx } }, data.metadata);
5858
const file = fRes.message.file;
5959

6060
// Negative: get file with invalid Authorization

integration-test/grpc-public.js

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -17,13 +17,13 @@ export function CheckUploadCatalogFile(client, data) {
1717
);
1818
const catalog = cRes.message && cRes.message.catalog ? cRes.message.catalog : {};
1919

20-
const reqBody = { namespaceId: data.expectedOwner.id, catalogId: catalog.catalogId, file: { name: constant.dbIDPrefix + "test-file-grpc-" + randomString(5) + ".doc", type: "FILE_TYPE_DOC", content: constant.sampleDoc } };
20+
const reqBody = { namespaceId: data.expectedOwner.id, catalogId: catalog.catalogId, file: { name: constant.dbIDPrefix + "test-file-grpc-" + randomString(5) + ".doc", type: "TYPE_DOC", content: constant.sampleDoc } };
2121
const resOrigin = client.invoke("artifact.artifact.v1alpha.ArtifactPublicService/UploadCatalogFile", reqBody, data.metadata);
2222
check(resOrigin, {
2323
"artifact.artifact.v1alpha.ArtifactPublicService/UploadCatalogFile response status is StatusOK": (r) => r.status === grpc.StatusOK,
2424
"artifact.artifact.v1alpha.ArtifactPublicService/UploadCatalogFile response file name": (r) => r.message.file.name === reqBody.file.name,
2525
"artifact.artifact.v1alpha.ArtifactPublicService/UploadCatalogFile response file uid": (r) => helper.isUUID(r.message.file.fileUid),
26-
"artifact.artifact.v1alpha.ArtifactPublicService/UploadCatalogFile response file type": (r) => r.message.file.type === "FILE_TYPE_DOC",
26+
"artifact.artifact.v1alpha.ArtifactPublicService/UploadCatalogFile response file type": (r) => r.message.file.type === "TYPE_DOC",
2727
"artifact.artifact.v1alpha.ArtifactPublicService/UploadCatalogFile response file is valid": (r) => helper.validateFileGRPC(r.message.file, false),
2828
});
2929

@@ -39,7 +39,7 @@ export function CheckListCatalogFiles(client, data) {
3939

4040
const cRes = client.invoke("artifact.artifact.v1alpha.ArtifactPublicService/CreateCatalog", { namespaceId: data.expectedOwner.id, name: constant.dbIDPrefix + randomString(10), description: randomString(30), tags: ["test", "integration", "grpc"], type: "CATALOG_TYPE_PERSISTENT" }, data.metadata);
4141
const catalog = cRes.message && cRes.message.catalog ? cRes.message.catalog : {};
42-
const fRes = client.invoke("artifact.artifact.v1alpha.ArtifactPublicService/UploadCatalogFile", { namespaceId: data.expectedOwner.id, catalogId: catalog.catalogId, file: { name: constant.dbIDPrefix + "test-file-grpc-" + randomString(5) + ".doc", type: "FILE_TYPE_DOC", content: constant.sampleDoc } }, data.metadata);
42+
const fRes = client.invoke("artifact.artifact.v1alpha.ArtifactPublicService/UploadCatalogFile", { namespaceId: data.expectedOwner.id, catalogId: catalog.catalogId, file: { name: constant.dbIDPrefix + "test-file-grpc-" + randomString(5) + ".doc", type: "TYPE_DOC", content: constant.sampleDoc } }, data.metadata);
4343

4444
const resOrigin = client.invoke("artifact.artifact.v1alpha.ArtifactPublicService/ListCatalogFiles", { namespaceId: data.expectedOwner.id, catalogId: catalog.catalogId, pageSize: 10 }, data.metadata);
4545
check(resOrigin, {
@@ -58,7 +58,7 @@ export function CheckGetCatalogFile(client, data) {
5858

5959
const cRes = client.invoke("artifact.artifact.v1alpha.ArtifactPublicService/CreateCatalog", { namespaceId: data.expectedOwner.id, name: constant.dbIDPrefix + randomString(10), description: randomString(30), tags: ["test", "integration", "grpc"], type: "CATALOG_TYPE_PERSISTENT" }, data.metadata);
6060
const catalog = cRes.message && cRes.message.catalog ? cRes.message.catalog : {};
61-
const fRes = client.invoke("artifact.artifact.v1alpha.ArtifactPublicService/UploadCatalogFile", { namespaceId: data.expectedOwner.id, catalogId: catalog.catalogId, file: { name: constant.dbIDPrefix + "test-file-grpc-" + randomString(5) + ".doc", type: "FILE_TYPE_DOC", content: constant.sampleDoc } }, data.metadata);
61+
const fRes = client.invoke("artifact.artifact.v1alpha.ArtifactPublicService/UploadCatalogFile", { namespaceId: data.expectedOwner.id, catalogId: catalog.catalogId, file: { name: constant.dbIDPrefix + "test-file-grpc-" + randomString(5) + ".doc", type: "TYPE_DOC", content: constant.sampleDoc } }, data.metadata);
6262
const file = fRes.message.file;
6363

6464
const resOrigin = client.invoke("artifact.artifact.v1alpha.ArtifactPublicService/GetCatalogFile", { namespaceId: data.expectedOwner.id, catalogId: catalog.catalogId, fileUid: file.fileUid }, data.metadata);
@@ -172,7 +172,7 @@ export function CheckCleanupOnFileDeletion(client, data) {
172172
// Upload a PDF file (will trigger conversion)
173173
const fRes = client.invoke(
174174
"artifact.artifact.v1alpha.ArtifactPublicService/UploadCatalogFile",
175-
{ namespaceId: data.expectedOwner.id, catalogId: catalog.catalogId, file: { name: constant.dbIDPrefix + "clf.pdf", type: "FILE_TYPE_PDF", content: constant.samplePdf } },
175+
{ namespaceId: data.expectedOwner.id, catalogId: catalog.catalogId, file: { name: constant.dbIDPrefix + "clf.pdf", type: "TYPE_PDF", content: constant.samplePdf } },
176176
data.metadata
177177
);
178178
const file = fRes.message && fRes.message.file ? fRes.message.file : {};

0 commit comments

Comments
 (0)