save sort data meta

pingcap · ti-chi-bot · Sep 19, 2023 · Sep 12, 2023 · Sep 12, 2023 · Sep 12, 2023
commit c4f5647e4fc1a6a17399860b5e75d680501d66fc
diff --git a/br/pkg/lightning/backend/external/util.go b/br/pkg/lightning/backend/external/util.go
@@ -241,3 +241,79 @@ func GetMaxOverlapping(points []Endpoint) int {
 	}
 	return int(maxWeight)
 }
+
+// SortedDataMeta is the meta of sorted data.
+type SortedDataMeta struct {
+	MinKey      []byte   `json:"min_key"`
+	MaxKey      []byte   `json:"max_key"`
+	TotalKVSize uint64   `json:"total_kv_size"`
+	DataFiles   []string `json:"data_files"`
+	StatFiles   []string `json:"stat_files"`
+}
+
+// NewSortedDataMeta creates a SortedDataMeta from a WriterSummary.
+func NewSortedDataMeta(summary *WriterSummary) *SortedDataMeta {
+	meta := &SortedDataMeta{
+		MinKey:      summary.Min.Clone(),
+		MaxKey:      summary.Max.Clone(),
+		TotalKVSize: summary.TotalSize,
+	}
+	for _, f := range summary.MultipleFilesStats {
+		for _, filename := range f.Filenames {
+			meta.DataFiles = append(meta.DataFiles, filename[0])
+			meta.StatFiles = append(meta.StatFiles, filename[1])
+		}
+	}
+	return meta
+}
+
+// Merge merges the other SortedDataMeta into this one.
+func (m *SortedDataMeta) Merge(other *SortedDataMeta) {
+	m.MinKey = NotNilMin(m.MinKey, other.MinKey)
+	m.MaxKey = NotNilMax(m.MaxKey, other.MaxKey)
+	m.TotalKVSize += other.TotalKVSize
+
+	m.DataFiles = append(m.DataFiles, other.DataFiles...)
+	m.StatFiles = append(m.StatFiles, other.StatFiles...)
+}
+
+// MergeSummary merges the WriterSummary into this SortedDataMeta.
+func (m *SortedDataMeta) MergeSummary(summary *WriterSummary) {
+	m.MinKey = NotNilMin(m.MinKey, summary.Min)
+	m.MaxKey = NotNilMax(m.MaxKey, summary.Max)
+	m.TotalKVSize += summary.TotalSize
+	for _, f := range summary.MultipleFilesStats {
+		for _, filename := range f.Filenames {
+			m.DataFiles = append(m.DataFiles, filename[0])
+			m.StatFiles = append(m.StatFiles, filename[1])
+		}
+	}
+}
+
+// NotNilMin returns the smallest of a and b, ignoring nil values.
+func NotNilMin(a, b []byte) []byte {
+	if len(a) == 0 {
+		return b
+	}
+	if len(b) == 0 {
+		return a
+	}
+	if bytes.Compare(a, b) < 0 {
+		return a
+	}
+	return b
+}
+
+// NotNilMax returns the largest of a and b, ignoring nil values.
+func NotNilMax(a, b []byte) []byte {
+	if len(a) == 0 {
+		return b
+	}
+	if len(b) == 0 {
+		return a
+	}
+	if bytes.Compare(a, b) > 0 {
+		return a
+	}
+	return b
+}
diff --git a/ddl/backfilling_dispatcher.go b/ddl/backfilling_dispatcher.go
@@ -309,12 +309,14 @@ func generateMergeSortPlan(
 				hex.EncodeToString(startKey), hex.EncodeToString(endKey))
 		}
 		m := &BackfillSubTaskMeta{
-			MinKey:         startKey,
-			MaxKey:         endKey,
-			DataFiles:      dataFiles,
-			StatFiles:      statFiles,
+			SortedDataMeta: external.SortedDataMeta{
+				MinKey:      startKey,
+				MaxKey:      endKey,
+				DataFiles:   dataFiles,
+				StatFiles:   statFiles,
+				TotalKVSize: totalSize / uint64(len(instanceIDs)),
+			},
 			RangeSplitKeys: rangeSplitKeys,
-			TotalKVSize:    totalSize / uint64(len(instanceIDs)),
 		}
 		metaBytes, err := json.Marshal(m)
 		if err != nil {
@@ -386,8 +388,8 @@ func getSummaryFromLastStep(
 		}
 		// Skip empty subtask.MinKey/MaxKey because it means
 		// no records need to be written in this subtask.
-		minKey = notNilMin(minKey, subtask.MinKey)
-		maxKey = notNilMax(maxKey, subtask.MaxKey)
+		minKey = external.NotNilMin(minKey, subtask.MinKey)
+		maxKey = external.NotNilMax(maxKey, subtask.MaxKey)
 		totalKVSize += subtask.TotalKVSize
 
 		allDataFiles = append(allDataFiles, subtask.DataFiles...)
@@ -409,31 +411,3 @@ func redactCloudStorageURI(
 	}
 	gTask.Meta = metaBytes
 }
-
-// notNilMin returns the smaller of a and b, ignoring nil values.
-func notNilMin(a, b []byte) []byte {
-	if len(a) == 0 {
-		return b
-	}
-	if len(b) == 0 {
-		return a
-	}
-	if bytes.Compare(a, b) < 0 {
-		return a
-	}
-	return b
-}
-
-// notNilMax returns the larger of a and b, ignoring nil values.
-func notNilMax(a, b []byte) []byte {
-	if len(a) == 0 {
-		return b
-	}
-	if len(b) == 0 {
-		return a
-	}
-	if bytes.Compare(a, b) > 0 {
-		return a
-	}
-	return b
-}
diff --git a/ddl/stage_scheduler.go b/ddl/stage_scheduler.go
@@ -19,6 +19,7 @@ import (
 	"encoding/json"
 
 	"github.com/pingcap/errors"
+	"github.com/pingcap/tidb/br/pkg/lightning/backend/external"
 	"github.com/pingcap/tidb/ddl/ingest"
 	"github.com/pingcap/tidb/disttask/framework/proto"
 	"github.com/pingcap/tidb/disttask/framework/scheduler"
@@ -44,12 +45,8 @@ type BackfillSubTaskMeta struct {
 	StartKey        []byte `json:"start_key"`
 	EndKey          []byte `json:"end_key"`
 
-	DataFiles      []string `json:"data_files"`
-	StatFiles      []string `json:"stat_files"`
-	RangeSplitKeys [][]byte `json:"range_split_keys"`
-	MinKey         []byte   `json:"min_key"`
-	MaxKey         []byte   `json:"max_key"`
-	TotalKVSize    uint64   `json:"total_kv_size"`
+	RangeSplitKeys          [][]byte `json:"range_split_keys"`
+	external.SortedDataMeta `json:",inline"`
 }
 
 // NewBackfillSubtaskExecutor creates a new backfill subtask executor.

diff --git a/disttask/importinto/encode_and_sort_operator.go b/disttask/importinto/encode_and_sort_operator.go
@@ -141,7 +141,7 @@ func newChunkWorker(ctx context.Context, op *encodeAndSortOperator, workerID int
 		indexWriterFn := func(indexID int64) *external.Writer {
 			builder := external.NewWriterBuilder().
 				SetOnCloseFunc(func(summary *external.WriterSummary) {
-					op.sharedVars.addIndexSummary(indexID, summary)
+					op.sharedVars.mergeIndexSummary(indexID, summary)
 				})
 			prefix := path.Join(strconv.Itoa(int(op.taskID)), strconv.Itoa(int(op.subtaskID)))
 			writerID := path.Join("index", strconv.Itoa(int(indexID)), strconv.Itoa(int(workerID)))
@@ -151,7 +151,7 @@ func newChunkWorker(ctx context.Context, op *encodeAndSortOperator, workerID int
 
 		// sorted data kv storage path: /{taskID}/{subtaskID}/data/{workerID}
 		builder := external.NewWriterBuilder().
-			SetOnCloseFunc(op.sharedVars.setDataSummary)
+			SetOnCloseFunc(op.sharedVars.mergeDataSummary)
 		prefix := path.Join(strconv.Itoa(int(op.taskID)), strconv.Itoa(int(op.subtaskID)))
 		writerID := path.Join("data", strconv.Itoa(int(workerID)))
 		writer := builder.Build(op.tableImporter.GlobalSortStore, prefix, writerID)

diff --git a/disttask/importinto/proto.go b/disttask/importinto/proto.go
@@ -70,6 +70,10 @@ type ImportStepMeta struct {
 	// the max id is same among all allocator types for now, since we're using same base, see
 	// NewPanickingAllocators for more info.
 	MaxIDs map[autoid.AllocatorType]int64
+
+	SortedDataMeta *external.SortedDataMeta
+	// SortedIndexMetas is a map from index id to its sorted kv meta.
+	SortedIndexMetas map[int64]*external.SortedDataMeta
 }
 
 // PostProcessStepMeta is the meta of post process step.
@@ -92,21 +96,31 @@ type SharedVars struct {
 	mu       sync.Mutex
 	Checksum *verification.KVChecksum
 
-	SortedDataSummary *external.WriterSummary
-	// SortedIndexSummaries is a map from index id to its sorted kv summary.
-	SortedIndexSummaries map[int64]*external.WriterSummary
+	SortedDataMeta *external.SortedDataMeta
+	// SortedIndexMetas is a map from index id to its sorted kv meta.
+	SortedIndexMetas map[int64]*external.SortedDataMeta
 }
 
-func (sv *SharedVars) setDataSummary(summary *external.WriterSummary) {
+func (sv *SharedVars) mergeDataSummary(summary *external.WriterSummary) {
 	sv.mu.Lock()
 	defer sv.mu.Unlock()
-	sv.SortedDataSummary = summary
+	if sv.SortedDataMeta == nil {
+		sv.SortedDataMeta = external.NewSortedDataMeta(summary)
+		return
+	}
+	sv.SortedDataMeta.MergeSummary(summary)
 }
 
-func (sv *SharedVars) addIndexSummary(indexID int64, summary *external.WriterSummary) {
+func (sv *SharedVars) mergeIndexSummary(indexID int64, summary *external.WriterSummary) {
 	sv.mu.Lock()
 	defer sv.mu.Unlock()
-	sv.SortedIndexSummaries[indexID] = summary
+	meta, ok := sv.SortedIndexMetas[indexID]
+	if !ok {
+		meta = external.NewSortedDataMeta(summary)
+		sv.SortedIndexMetas[indexID] = meta
+		return
+	}
+	meta.MergeSummary(summary)
 }
 
 // importStepMinimalTask is the minimal task of IMPORT INTO.

diff --git a/disttask/importinto/scheduler.go b/disttask/importinto/scheduler.go
@@ -126,12 +126,12 @@ func (s *importStepExecutor) RunSubtask(ctx context.Context, subtask *proto.Subt
 		}
 	}
 	sharedVars := &SharedVars{
-		TableImporter:        s.tableImporter,
-		DataEngine:           dataEngine,
-		IndexEngine:          indexEngine,
-		Progress:             asyncloaddata.NewProgress(false),
-		Checksum:             &verification.KVChecksum{},
-		SortedIndexSummaries: make(map[int64]*external.WriterSummary),
+		TableImporter:    s.tableImporter,
+		DataEngine:       dataEngine,
+		IndexEngine:      indexEngine,
+		Progress:         asyncloaddata.NewProgress(false),
+		Checksum:         &verification.KVChecksum{},
+		SortedIndexMetas: make(map[int64]*external.SortedDataMeta),
 	}
 	s.sharedVars.Store(subtaskMeta.ID, sharedVars)
 
@@ -216,6 +216,8 @@ func (s *importStepExecutor) OnFinished(ctx context.Context, subtask *proto.Subt
 		autoid.AutoIncrementType: allocators.Get(autoid.AutoIncrementType).Base(),
 		autoid.AutoRandomType:    allocators.Get(autoid.AutoRandomType).Base(),
 	}
+	subtaskMeta.SortedDataMeta = sharedVars.SortedDataMeta
+	subtaskMeta.SortedIndexMetas = sharedVars.SortedIndexMetas
 	s.sharedVars.Delete(subtaskMeta.ID)
 	newMeta, err := json.Marshal(subtaskMeta)
 	if err != nil {