Skip to content

Commit 925dd1c

Browse files
authored
feat(conversion): extract pages in conversion step (#260)
Because - The service layer is ready for adding the following metadata: - Page length to file - Page delimiters to converted file - Page references to chunk This commit - Adds utility functions for page extraction - Returns one string per page on conversion pipeline - Parses conversion results from pipeline - Adds page length check in integration-tests
1 parent 56f0368 commit 925dd1c

File tree

12 files changed

+317
-68
lines changed

12 files changed

+317
-68
lines changed

integration-test/proto/artifact/artifact/v1alpha/artifact.proto

Lines changed: 35 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -415,6 +415,31 @@ enum FileType {
415415

416416
// file
417417
message File {
418+
// Position within a file, as coordinates in a a specific unit. The
419+
// number of dimensions of the coordinate depends on the unit type.
420+
message Position {
421+
// Unit of measurement for a position within a file.
422+
enum Unit {
423+
// Unspecified.
424+
UNIT_UNSPECIFIED = 0;
425+
// Character positions (for Markdown and other text files).
426+
UNIT_CHARACTER = 1;
427+
// Page positions (for documents). For pages, positions are 1-indexed
428+
// (e.g., page 4 of 4) to align with document visualization standards.
429+
UNIT_PAGE = 2;
430+
// Time positions in milliseconds (for audio/video files).
431+
UNIT_TIME_MS = 3;
432+
// Pixel positions (for images and other 2D content).
433+
UNIT_PIXEL = 4;
434+
}
435+
436+
// Unit of measurement for the position.
437+
Unit unit = 1 [(google.api.field_behavior) = OUTPUT_ONLY];
438+
439+
// Position value.
440+
repeated uint32 coordinates = 2 [(google.api.field_behavior) = OUTPUT_ONLY];
441+
}
442+
418443
// file uid
419444
string file_uid = 1 [(google.api.field_behavior) = OUTPUT_ONLY];
420445
// file name
@@ -484,6 +509,10 @@ message File {
484509
// (such files are typically trivial to convert and don't require a dedicated
485510
// pipeline to improve the conversion performance).
486511
optional string converting_pipeline = 21;
512+
// Length of the file in the specified unit type. It is defined as the number
513+
// of positions (the unit will depend on the file type) that can be accessed
514+
// in the file.
515+
Position length = 22 [(google.api.field_behavior) = OUTPUT_ONLY];
487516
}
488517

489518
// upload file request
@@ -526,11 +555,13 @@ message ProcessCatalogFilesResponse {
526555
repeated File files = 1;
527556
}
528557

529-
// list file filter
530-
// todo: support more parameters
558+
// ListCatalogFilesFilter contains a set of properties to filter a catalog file
559+
// list by.
531560
message ListCatalogFilesFilter {
532-
// The file uids.
533-
repeated string file_uids = 2 [(google.api.field_behavior) = OPTIONAL];
561+
// File UIDs.
562+
repeated string file_uids = 1 [(google.api.field_behavior) = OPTIONAL];
563+
// Processing status of the files.
564+
FileProcessStatus process_status = 2 [(google.api.field_behavior) = OPTIONAL];
534565
}
535566

536567
// list files request

integration-test/proto/artifact/artifact/v1alpha/artifact_public_service.proto

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -380,7 +380,6 @@ service ArtifactPublicService {
380380
//
381381
// Returns the upload URL of an object.
382382
rpc GetObjectUploadURL(GetObjectUploadURLRequest) returns (GetObjectUploadURLResponse) {
383-
option deprecated = true;
384383
option (google.api.http) = {get: "/v1alpha/namespaces/{namespace_id}/object-upload-url"};
385384
option (grpc.gateway.protoc_gen_openapiv2.options.openapiv2_operation) = {
386385
tags: "Artifact"
@@ -395,7 +394,6 @@ service ArtifactPublicService {
395394
//
396395
// Returns the download URL of an object.
397396
rpc GetObjectDownloadURL(GetObjectDownloadURLRequest) returns (GetObjectDownloadURLResponse) {
398-
option deprecated = true;
399397
option (google.api.http) = {get: "/v1alpha/namespaces/{namespace_id}/object-download-url"};
400398
option (grpc.gateway.protoc_gen_openapiv2.options.openapiv2_operation) = {
401399
tags: "Artifact"

integration-test/proto/artifact/artifact/v1alpha/chunk.proto

Lines changed: 26 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ syntax = "proto3";
22

33
package artifact.artifact.v1alpha;
44

5+
import "artifact/artifact/v1alpha/artifact.proto";
56
// Protocol Buffers Well-Known Types
67
import "google/api/field_behavior.proto";
78
import "google/protobuf/timestamp.proto";
@@ -34,14 +35,18 @@ enum ContentType {
3435

3536
// The Chunk message represents a chunk of data in the artifact system.
3637
message Chunk {
38+
// Reference represents the position of a chunk within a file.
39+
message Reference {
40+
// Start position of the chunk within the file.
41+
File.Position start = 1 [(google.api.field_behavior) = OUTPUT_ONLY];
42+
// End position of the chunk within the file.
43+
File.Position end = 2 [(google.api.field_behavior) = OUTPUT_ONLY];
44+
}
45+
3746
// unique identifier of the chunk
3847
string chunk_uid = 1 [(google.api.field_behavior) = OUTPUT_ONLY];
3948
// whether the chunk is retrievable
4049
bool retrievable = 2 [(google.api.field_behavior) = OUTPUT_ONLY];
41-
// start position of the chunk in the source file
42-
uint32 start_pos = 4 [(google.api.field_behavior) = OUTPUT_ONLY];
43-
// end position of the chunk in the source file
44-
uint32 end_pos = 5 [(google.api.field_behavior) = OUTPUT_ONLY];
4550
// tokens of the chunk
4651
uint32 tokens = 6 [(google.api.field_behavior) = OUTPUT_ONLY];
4752
// creation time of the chunk
@@ -50,6 +55,23 @@ message Chunk {
5055
string original_file_uid = 8 [(google.api.field_behavior) = OUTPUT_ONLY];
5156
// content type
5257
ContentType content_type = 9 [(google.api.field_behavior) = OUTPUT_ONLY];
58+
// Reference to the position of the chunk within the original file.
59+
Reference reference = 10 [(google.api.field_behavior) = OUTPUT_ONLY];
60+
// Reference to the position of the chunk within the Markdown (source) file.
61+
Reference markdown_reference = 11 [(google.api.field_behavior) = OUTPUT_ONLY];
62+
63+
// start position of the chunk in the source file
64+
// Deprecated: use markdown_reference instead
65+
uint32 start_pos = 4 [
66+
deprecated = true,
67+
(google.api.field_behavior) = OUTPUT_ONLY
68+
];
69+
// end position of the chunk in the source file
70+
// Deprecated: use markdown_reference instead
71+
uint32 end_pos = 5 [
72+
deprecated = true,
73+
(google.api.field_behavior) = OUTPUT_ONLY
74+
];
5375
}
5476

5577
// The ListChunksRequest message represents a request to list chunks in the artifact system.

integration-test/proto/artifact/artifact/v1alpha/object.proto

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@ import "google/protobuf/timestamp.proto";
99

1010
// Object
1111
message Object {
12-
option deprecated = true;
1312
// uid
1413
string uid = 1;
1514
// name of the object
@@ -40,7 +39,6 @@ message Object {
4039

4140
// GetObjectUploadURLRequest
4241
message GetObjectUploadURLRequest {
43-
option deprecated = true;
4442
// id of the namespace
4543
string namespace_id = 1 [(google.api.field_behavior) = REQUIRED];
4644
// name of the object with length limit to 1024 characters.
@@ -59,7 +57,6 @@ message GetObjectUploadURLRequest {
5957

6058
// GetObjectUploadURLResponse
6159
message GetObjectUploadURLResponse {
62-
option deprecated = true;
6360
// upload url
6461
string upload_url = 1;
6562
// expire at in UTC (UTC+0)
@@ -70,7 +67,6 @@ message GetObjectUploadURLResponse {
7067

7168
// GetObjectDownloadURLRequest
7269
message GetObjectDownloadURLRequest {
73-
option deprecated = true;
7470
// id of the namespace
7571
string namespace_id = 1 [(google.api.field_behavior) = REQUIRED];
7672
// uid of the object
@@ -82,7 +78,6 @@ message GetObjectDownloadURLRequest {
8278

8379
// GetObjectDownloadURLResponse
8480
message GetObjectDownloadURLResponse {
85-
option deprecated = true;
8681
// download url
8782
string download_url = 1;
8883
// expire at in UTC (UTC+0)

integration-test/rest-public.js

Lines changed: 13 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -340,6 +340,19 @@ export function CheckCatalog(data) {
340340
[`GET ${viewPath} file has summary (${f.name}: ${f.type})`]: (r) => r.json().files[0].summary.length > 0,
341341
[`GET ${viewPath} file has downloadUrl (${f.name}: ${f.type})`]: (r) => r.json().files[0].downloadUrl.includes("v1alpha/blob-urls/"),
342342
});
343+
344+
// Check conversion pipeline and page information depending on the file type
345+
const isDocumentType = ["FILE_TYPE_PDF", "FILE_TYPE_DOC", "FILE_TYPE_DOCX", "FILE_TYPE_PPT", "FILE_TYPE_PPTX"].includes(f.type);
346+
347+
if (isDocumentType) {
348+
// For document types, check that length unit is pages and coordinates contain page count
349+
const fileData = viewRes.json().files[0];
350+
check(viewRes, {
351+
[`GET ${viewPath} file has length unit UNIT_PAGE (${f.name}: ${f.type})`]: () => fileData.length && fileData.length.unit === "UNIT_PAGE",
352+
[`GET ${viewPath} file has length coordinates (${f.name}: ${f.type})`]: () => fileData.length && Array.isArray(fileData.length.coordinates) && fileData.length.coordinates.length > 0,
353+
[`GET ${viewPath} file length coordinates is positive (${f.name}: ${f.type})`]: () => fileData.length && fileData.length.coordinates[0] > 0,
354+
});
355+
}
343356
}
344357

345358
// List catalog files

pkg/service/conversion.go

Lines changed: 96 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ package service
33
import (
44
"context"
55
"fmt"
6+
"strings"
67
"time"
78

89
"go.uber.org/zap"
@@ -39,27 +40,53 @@ type MDConversionResult struct {
3940

4041
// convertResultParser extracts the conversion result from the pipeline
4142
// response. It first checks for a non-empty "convert_result" field, then falls
42-
// back to "convert_result2". Returns an error if neither field contains valid
43-
// data or if the response structure is invalid.
44-
func convertResultParser(resp *pipelinepb.TriggerNamespacePipelineReleaseResponse) (string, error) {
43+
// back to "convert_result2". It handles both single strings (for non-page-based
44+
// files) and arrays of strings (for page-based files). Returns an error if
45+
// neither field contains valid data or if the response structure is invalid.
46+
func convertResultParser(resp *pipelinepb.TriggerNamespacePipelineReleaseResponse) (*MDConversionResult, error) {
4547
if resp == nil || len(resp.Outputs) == 0 {
46-
return "", fmt.Errorf("response is nil or has no outputs. resp: %v", resp)
48+
return nil, fmt.Errorf("response is nil or has no outputs. resp: %v", resp)
4749
}
4850
fields := resp.Outputs[0].GetFields()
4951
if fields == nil {
50-
return "", fmt.Errorf("fields in the output are nil. resp: %v", resp)
52+
return nil, fmt.Errorf("fields in the output are nil. resp: %v", resp)
5153
}
5254

55+
// Try convert_result first, then convert_result2 as fallback
56+
suffix := "\n"
5357
convertResult, ok := fields["convert_result"]
54-
if ok && convertResult.GetStringValue() != "" {
55-
return convertResult.GetStringValue(), nil
58+
if !ok {
59+
convertResult, ok = fields["convert_result2"]
60+
if !ok {
61+
return nil, fmt.Errorf("no conversion result fields found in response")
62+
}
63+
64+
suffix = ""
5665
}
57-
convertResult2, ok2 := fields["convert_result2"]
58-
if ok2 && convertResult2.GetStringValue() != "" {
59-
return convertResult2.GetStringValue(), nil
66+
67+
// Check if it's a list (page-based files)
68+
if list := convertResult.GetListValue(); list != nil {
69+
pages := ProtoListToStrings(list, suffix)
70+
if len(pages) == 0 {
71+
return nil, fmt.Errorf("empty page list in conversion result")
72+
}
73+
74+
return &MDConversionResult{
75+
Markdown: strings.Join(pages, ""),
76+
Length: []uint32{uint32(len(pages))},
77+
PositionData: PositionDataFromPages(pages),
78+
}, nil
6079
}
6180

62-
return "", nil
81+
// Check if it's a string (non-page-based files)
82+
md := convertResult.GetStringValue()
83+
if md == "" {
84+
return nil, fmt.Errorf("empty markdown string in conversion result")
85+
}
86+
return &MDConversionResult{
87+
Markdown: md,
88+
// No length or position data for non-page-based files
89+
}, nil
6390
}
6491

6592
// ConvertToMDPipe converts a file into Markdown by triggering a converting
@@ -107,12 +134,17 @@ func (s *service) ConvertToMDPipe(ctx context.Context, p MDConversionParams) (*M
107134
artifactpb.FileType_FILE_TYPE_PPT,
108135
artifactpb.FileType_FILE_TYPE_PPTX:
109136

110-
if len(p.Pipelines) != 0 {
111-
break
112-
}
113-
114-
p.Pipelines = DefaultConversionPipelines
137+
// If this is a reprocessing scenario with the default pipeline (same
138+
// namespace and ID, but potentially different version), reprocess with
139+
// the newest version of the default pipeline
140+
reprocessWithDefaultPipeline := len(p.Pipelines) == 1 &&
141+
p.Pipelines[0].Namespace == ConvertDocToMDPipeline.Namespace &&
142+
p.Pipelines[0].ID == ConvertDocToMDPipeline.ID
115143

144+
//
145+
if len(p.Pipelines) == 0 || reprocessWithDefaultPipeline {
146+
p.Pipelines = DefaultConversionPipelines
147+
}
116148
default:
117149
return nil, fmt.Errorf("unsupported file type: %v", p.Type)
118150
}
@@ -130,27 +162,63 @@ func (s *service) ConvertToMDPipe(ctx context.Context, p MDConversionParams) (*M
130162
return nil, fmt.Errorf("triggering %s pipeline: %w", pipeline.ID, err)
131163
}
132164

133-
md, err := convertResultParser(resp)
165+
result, err := convertResultParser(resp)
134166
if err != nil {
135167
return nil, fmt.Errorf("getting conversion result: %w", err)
136168
}
137169

138-
if md == "" {
170+
if result == nil || result.Markdown == "" {
139171
logger.Info("Conversion pipeline didn't yield results", zap.String("pipeline", pipeline.Name()))
140172
continue
141173
}
142174

143-
return &MDConversionResult{
144-
Markdown: md,
145-
PipelineRelease: pipeline,
146-
147-
// TODO jvallesm: read the length and position data from the
148-
// pipeline results. First we'll update the conversion method
149-
// interface and implement the changes in the clients. After
150-
// verifying these are backwards-compatible, we'll implement the
151-
// length and position extraction.
152-
}, nil
175+
// Set the pipeline release used for this conversion
176+
result.PipelineRelease = pipeline
177+
return result, nil
153178
}
154179

155180
return nil, fmt.Errorf("conversion pipelines didn't produce any result")
156181
}
182+
183+
// ProtoListToStrings returns a proto list of strings as a string slice. The empty
184+
// elements will be removed. A suffix can be passed, which will be appended to
185+
// all the elements but the last one. This will produce the same effect than
186+
// strings.Join(asStrings, suffix) in upstream code, but allows for page
187+
// delimiter extraction before that step.
188+
func ProtoListToStrings(list *structpb.ListValue, suffix string) []string {
189+
values := list.GetValues()
190+
asStrings := make([]string, 0, len(values))
191+
for i, v := range values {
192+
s := v.GetStringValue()
193+
if s == "" {
194+
continue
195+
}
196+
197+
if len(suffix) > 0 && !strings.HasSuffix(s, suffix) && i < len(values)-1 {
198+
s = s + suffix
199+
}
200+
201+
asStrings = append(asStrings, s)
202+
}
203+
204+
return asStrings
205+
}
206+
207+
// PositionDataFromPages extracts the page delimiters from a list of pages.
208+
func PositionDataFromPages(pages []string) *repository.PositionData {
209+
if len(pages) == 0 {
210+
return nil
211+
}
212+
213+
var offset uint32
214+
positionData := &repository.PositionData{
215+
PageDelimiters: make([]uint32, len(pages)),
216+
}
217+
218+
for i, page := range pages {
219+
offset += uint32(len([]rune(page)))
220+
positionData.PageDelimiters[i] = offset
221+
}
222+
223+
return positionData
224+
}

0 commit comments

Comments
 (0)