instill-ai
diff --git a/‎integration-test/proto/artifact/artifact/v1alpha/artifact.proto‎
Lines changed: 35 additions & 4 deletions b/‎integration-test/proto/artifact/artifact/v1alpha/artifact.proto‎
Lines changed: 35 additions & 4 deletions
diff --git a/‎integration-test/proto/artifact/artifact/v1alpha/artifact_public_service.proto‎
Lines changed: 0 additions & 2 deletions b/‎integration-test/proto/artifact/artifact/v1alpha/artifact_public_service.proto‎
Lines changed: 0 additions & 2 deletions
diff --git a/‎integration-test/proto/artifact/artifact/v1alpha/chunk.proto‎
Lines changed: 26 additions & 4 deletions b/‎integration-test/proto/artifact/artifact/v1alpha/chunk.proto‎
Lines changed: 26 additions & 4 deletions
diff --git a/‎integration-test/proto/artifact/artifact/v1alpha/object.proto‎
Lines changed: 0 additions & 5 deletions b/‎integration-test/proto/artifact/artifact/v1alpha/object.proto‎
Lines changed: 0 additions & 5 deletions
diff --git a/‎integration-test/rest-public.js‎
Lines changed: 13 additions & 0 deletions b/‎integration-test/rest-public.js‎
Lines changed: 13 additions & 0 deletions
diff --git a/‎pkg/service/conversion.go‎
Lines changed: 96 additions & 28 deletions b/‎pkg/service/conversion.go‎
Lines changed: 96 additions & 28 deletions
@@ -415,6 +415,31 @@ enum FileType {
 
 // file
 message File {
+  // Position within a file, as coordinates in a specific unit. The
+  // number of dimensions of the coordinate depends on the unit type.
+  message Position {
+    // Unit of measurement for a position within a file.
+    enum Unit {
+      // Unspecified.
+      UNIT_UNSPECIFIED = 0;
+      // Character positions (for Markdown and other text files).
+      UNIT_CHARACTER = 1;
+      // Page positions (for documents). For pages, positions are 1-indexed
+      // (e.g., page 4 of 4) to align with document visualization standards.
+      UNIT_PAGE = 2;
+      // Time positions in milliseconds (for audio/video files).
+      UNIT_TIME_MS = 3;
+      // Pixel positions (for images and other 2D content).
+      UNIT_PIXEL = 4;
+    }
+
+    // Unit of measurement for the position.
+    Unit unit = 1 [(google.api.field_behavior) = OUTPUT_ONLY];
+
+    // Coordinates of the position, expressed in the unit above.
+    repeated uint32 coordinates = 2 [(google.api.field_behavior) = OUTPUT_ONLY];
+  }
+
   // file uid
   string file_uid = 1 [(google.api.field_behavior) = OUTPUT_ONLY];
   // file name
@@ -484,6 +509,10 @@ message File {
   // (such files are typically trivial to convert and don't require a dedicated
   // pipeline to improve the conversion performance).
   optional string converting_pipeline = 21;
+  // Length of the file in the specified unit type. It is defined as the number
+  // of positions (the unit will depend on the file type) that can be accessed
+  // in the file.
+  Position length = 22 [(google.api.field_behavior) = OUTPUT_ONLY];
 }
 
 // upload file request
@@ -526,11 +555,13 @@ message ProcessCatalogFilesResponse {
   repeated File files = 1;
 }
 
-// list file filter
-// todo: support more parameters
+// ListCatalogFilesFilter contains a set of properties to filter a catalog file
+// list by.
 message ListCatalogFilesFilter {
-  // The file uids.
-  repeated string file_uids = 2 [(google.api.field_behavior) = OPTIONAL];
+  // File UIDs.
+  repeated string file_uids = 1 [(google.api.field_behavior) = OPTIONAL];
+  // Processing status of the files.
+  FileProcessStatus process_status = 2 [(google.api.field_behavior) = OPTIONAL];
 }
 
 // list files request
 
@@ -380,7 +380,6 @@ service ArtifactPublicService {
   //
   // Returns the upload URL of an object.
   rpc GetObjectUploadURL(GetObjectUploadURLRequest) returns (GetObjectUploadURLResponse) {
-    option deprecated = true;
     option (google.api.http) = {get: "/v1alpha/namespaces/{namespace_id}/object-upload-url"};
     option (grpc.gateway.protoc_gen_openapiv2.options.openapiv2_operation) = {
       tags: "Artifact"
@@ -395,7 +394,6 @@ service ArtifactPublicService {
   //
   // Returns the download URL of an object.
   rpc GetObjectDownloadURL(GetObjectDownloadURLRequest) returns (GetObjectDownloadURLResponse) {
-    option deprecated = true;
     option (google.api.http) = {get: "/v1alpha/namespaces/{namespace_id}/object-download-url"};
     option (grpc.gateway.protoc_gen_openapiv2.options.openapiv2_operation) = {
       tags: "Artifact"
 
@@ -2,6 +2,7 @@ syntax = "proto3";
 
 package artifact.artifact.v1alpha;
 
+import "artifact/artifact/v1alpha/artifact.proto";
 // Protocol Buffers Well-Known Types
 import "google/api/field_behavior.proto";
 import "google/protobuf/timestamp.proto";
@@ -34,14 +35,18 @@ enum ContentType {
 
 // The Chunk message represents a chunk of data in the artifact system.
 message Chunk {
+  // Reference represents the position of a chunk within a file.
+  message Reference {
+    // Start position of the chunk within the file.
+    File.Position start = 1 [(google.api.field_behavior) = OUTPUT_ONLY];
+    // End position of the chunk within the file.
+    File.Position end = 2 [(google.api.field_behavior) = OUTPUT_ONLY];
+  }
+
   // unique identifier of the chunk
   string chunk_uid = 1 [(google.api.field_behavior) = OUTPUT_ONLY];
   // whether the chunk is retrievable
   bool retrievable = 2 [(google.api.field_behavior) = OUTPUT_ONLY];
-  // start position of the chunk in the source file
-  uint32 start_pos = 4 [(google.api.field_behavior) = OUTPUT_ONLY];
-  // end position of the chunk in the source file
-  uint32 end_pos = 5 [(google.api.field_behavior) = OUTPUT_ONLY];
   // tokens of the chunk
   uint32 tokens = 6 [(google.api.field_behavior) = OUTPUT_ONLY];
   // creation time of the chunk
@@ -50,6 +55,23 @@ message Chunk {
   string original_file_uid = 8 [(google.api.field_behavior) = OUTPUT_ONLY];
   // content type
   ContentType content_type = 9 [(google.api.field_behavior) = OUTPUT_ONLY];
+  // Reference to the position of the chunk within the original file.
+  Reference reference = 10 [(google.api.field_behavior) = OUTPUT_ONLY];
+  // Reference to the position of the chunk within the Markdown (source) file.
+  Reference markdown_reference = 11 [(google.api.field_behavior) = OUTPUT_ONLY];
+
+  // start position of the chunk in the source file
+  // Deprecated: use markdown_reference instead
+  uint32 start_pos = 4 [
+    deprecated = true,
+    (google.api.field_behavior) = OUTPUT_ONLY
+  ];
+  // end position of the chunk in the source file
+  // Deprecated: use markdown_reference instead
+  uint32 end_pos = 5 [
+    deprecated = true,
+    (google.api.field_behavior) = OUTPUT_ONLY
+  ];
 }
 
 // The ListChunksRequest message represents a request to list chunks in the artifact system.
 
@@ -9,7 +9,6 @@ import "google/protobuf/timestamp.proto";
 
 // Object
 message Object {
-  option deprecated = true;
   // uid
   string uid = 1;
   // name of the object
@@ -40,7 +39,6 @@ message Object {
 
 // GetObjectUploadURLRequest
 message GetObjectUploadURLRequest {
-  option deprecated = true;
   // id of the namespace
   string namespace_id = 1 [(google.api.field_behavior) = REQUIRED];
   // name of the object with length limit to 1024 characters.
@@ -59,7 +57,6 @@ message GetObjectUploadURLRequest {
 
 // GetObjectUploadURLResponse
 message GetObjectUploadURLResponse {
-  option deprecated = true;
   // upload url
   string upload_url = 1;
   // expire at in UTC (UTC+0)
@@ -70,7 +67,6 @@ message GetObjectUploadURLResponse {
 
 // GetObjectDownloadURLRequest
 message GetObjectDownloadURLRequest {
-  option deprecated = true;
   // id of the namespace
   string namespace_id = 1 [(google.api.field_behavior) = REQUIRED];
   // uid of the object
@@ -82,7 +78,6 @@ message GetObjectDownloadURLRequest {
 
 // GetObjectDownloadURLResponse
 message GetObjectDownloadURLResponse {
-  option deprecated = true;
   // download url
   string download_url = 1;
   // expire at in UTC (UTC+0)
 
@@ -340,6 +340,19 @@ export function CheckCatalog(data) {
         [`GET ${viewPath} file has summary (${f.name}: ${f.type})`]: (r) => r.json().files[0].summary.length > 0,
         [`GET ${viewPath} file has downloadUrl (${f.name}: ${f.type})`]: (r) => r.json().files[0].downloadUrl.includes("v1alpha/blob-urls/"),
       });
+
+      // Check the file length (page count) information for document file types
+      const isDocumentType = ["FILE_TYPE_PDF", "FILE_TYPE_DOC", "FILE_TYPE_DOCX", "FILE_TYPE_PPT", "FILE_TYPE_PPTX"].includes(f.type);
+
+      if (isDocumentType) {
+        // For document types, check that length unit is pages and coordinates contain page count
+        const fileData = viewRes.json().files[0];
+        check(viewRes, {
+          [`GET ${viewPath} file has length unit UNIT_PAGE (${f.name}: ${f.type})`]: () => fileData.length && fileData.length.unit === "UNIT_PAGE",
+          [`GET ${viewPath} file has length coordinates (${f.name}: ${f.type})`]: () => fileData.length && Array.isArray(fileData.length.coordinates) && fileData.length.coordinates.length > 0,
+          [`GET ${viewPath} file length coordinates is positive (${f.name}: ${f.type})`]: () => fileData.length && fileData.length.coordinates[0] > 0,
+        });
+      }
     }
 
     // List catalog files
 
@@ -3,6 +3,7 @@ package service
 import (
 	"context"
 	"fmt"
+	"strings"
 	"time"
 
 	"go.uber.org/zap"
@@ -39,27 +40,53 @@ type MDConversionResult struct {
 
 // convertResultParser extracts the conversion result from the pipeline
 // response. It first checks for a non-empty "convert_result" field, then falls
-// back to "convert_result2". Returns an error if neither field contains valid
-// data or if the response structure is invalid.
-func convertResultParser(resp *pipelinepb.TriggerNamespacePipelineReleaseResponse) (string, error) {
+// back to "convert_result2". It handles both single strings (for non-page-based
+// files) and arrays of strings (for page-based files). Returns an error if
+// neither field contains valid data or if the response structure is invalid.
+func convertResultParser(resp *pipelinepb.TriggerNamespacePipelineReleaseResponse) (*MDConversionResult, error) {
 	if resp == nil || len(resp.Outputs) == 0 {
-		return "", fmt.Errorf("response is nil or has no outputs. resp: %v", resp)
+		return nil, fmt.Errorf("response is nil or has no outputs. resp: %v", resp)
 	}
 	fields := resp.Outputs[0].GetFields()
 	if fields == nil {
-		return "", fmt.Errorf("fields in the output are nil. resp: %v", resp)
+		return nil, fmt.Errorf("fields in the output are nil. resp: %v", resp)
 	}
 
+	// Prefer convert_result; fall back to convert_result2 only when that key is absent
+	suffix := "\n"
 	convertResult, ok := fields["convert_result"]
-	if ok && convertResult.GetStringValue() != "" {
-		return convertResult.GetStringValue(), nil
+	if !ok {
+		convertResult, ok = fields["convert_result2"]
+		if !ok {
+			return nil, fmt.Errorf("no conversion result fields found in response")
+		}
+
+		suffix = ""
 	}
-	convertResult2, ok2 := fields["convert_result2"]
-	if ok2 && convertResult2.GetStringValue() != "" {
-		return convertResult2.GetStringValue(), nil
+
+	// Check if it's a list (page-based files)
+	if list := convertResult.GetListValue(); list != nil {
+		pages := ProtoListToStrings(list, suffix)
+		if len(pages) == 0 {
+			return nil, fmt.Errorf("empty page list in conversion result")
+		}
+
+		return &MDConversionResult{
+			Markdown:     strings.Join(pages, ""),
+			Length:       []uint32{uint32(len(pages))},
+			PositionData: PositionDataFromPages(pages),
+		}, nil
 	}
 
-	return "", nil
+	// Check if it's a string (non-page-based files)
+	md := convertResult.GetStringValue()
+	if md == "" {
+		return nil, fmt.Errorf("empty markdown string in conversion result")
+	}
+	return &MDConversionResult{
+		Markdown: md,
+		// No length or position data for non-page-based files
+	}, nil
 }
 
 // ConvertToMDPipe converts a file into Markdown by triggering a converting
@@ -107,12 +134,17 @@ func (s *service) ConvertToMDPipe(ctx context.Context, p MDConversionParams) (*M
 		artifactpb.FileType_FILE_TYPE_PPT,
 		artifactpb.FileType_FILE_TYPE_PPTX:
 
-		if len(p.Pipelines) != 0 {
-			break
-		}
-
-		p.Pipelines = DefaultConversionPipelines
+		// If this is a reprocessing scenario with the default pipeline (same
+		// namespace and ID, but potentially different version), reprocess with
+		// the newest version of the default pipeline
+		reprocessWithDefaultPipeline := len(p.Pipelines) == 1 &&
+			p.Pipelines[0].Namespace == ConvertDocToMDPipeline.Namespace &&
+			p.Pipelines[0].ID == ConvertDocToMDPipeline.ID
 
+		// Use the default pipelines when none are given or when reprocessing with the default one
+		if len(p.Pipelines) == 0 || reprocessWithDefaultPipeline {
+			p.Pipelines = DefaultConversionPipelines
+		}
 	default:
 		return nil, fmt.Errorf("unsupported file type: %v", p.Type)
 	}
@@ -130,27 +162,63 @@ func (s *service) ConvertToMDPipe(ctx context.Context, p MDConversionParams) (*M
 			return nil, fmt.Errorf("triggering %s pipeline: %w", pipeline.ID, err)
 		}
 
-		md, err := convertResultParser(resp)
+		result, err := convertResultParser(resp)
 		if err != nil {
 			return nil, fmt.Errorf("getting conversion result: %w", err)
 		}
 
-		if md == "" {
+		if result == nil || result.Markdown == "" {
 			logger.Info("Conversion pipeline didn't yield results", zap.String("pipeline", pipeline.Name()))
 			continue
 		}
 
-		return &MDConversionResult{
-			Markdown:        md,
-			PipelineRelease: pipeline,
-
-			// TODO jvallesm: read the length and position data from the
-			// pipeline results. First we'll update the conversion method
-			// interface and implement the changes in the clients. After
-			// verifying these are backwards-compatible, we'll implement the
-			// length and position extraction.
-		}, nil
+		// Set the pipeline release used for this conversion
+		result.PipelineRelease = pipeline
+		return result, nil
 	}
 
 	return nil, fmt.Errorf("conversion pipelines didn't produce any result")
 }
+
+// ProtoListToStrings returns the string elements of a proto list as a string
+// slice, dropping the empty ones. When suffix is non-empty, it is appended to
+// every returned element except the last one, unless the element already ends
+// with it. This produces the same effect as strings.Join(asStrings, suffix) in
+// upstream code, but allows for page delimiter extraction before that step.
+func ProtoListToStrings(list *structpb.ListValue, suffix string) []string {
+	values := list.GetValues()
+	asStrings := make([]string, 0, len(values))
+	for _, v := range values {
+		if s := v.GetStringValue(); s != "" {
+			asStrings = append(asStrings, s)
+		}
+	}
+
+	if suffix == "" {
+		return asStrings
+	}
+
+	// The suffix is applied after filtering so that "last" refers to the last
+	// kept element: trailing empty values in the input must not leave a
+	// dangling suffix on the final element.
+	for i := 0; i < len(asStrings)-1; i++ {
+		if !strings.HasSuffix(asStrings[i], suffix) {
+			asStrings[i] += suffix
+		}
+	}
+
+	return asStrings
+}
+
+// PositionDataFromPages builds the position data for a list of pages. Each
+// page delimiter holds the cumulative length, in runes, of all the pages up to
+// and including that page. It returns nil when there are no pages.
+func PositionDataFromPages(pages []string) *repository.PositionData {
+	if len(pages) == 0 {
+		return nil
+	}
+
+	delimiters := make([]uint32, 0, len(pages))
+	var total uint32
+	for _, p := range pages {
+		total += uint32(len([]rune(p)))
+		delimiters = append(delimiters, total)
+	}
+
+	return &repository.PositionData{PageDelimiters: delimiters}
+}