Skip to content

BUG: Race condition in YAML metrics file operations causes data loss under concurrent access #1198

@gemy26

Description

@gemy26

Describe the bug
The internal/metrics/yaml.go has a race condition in its create-delete-update operations. When multiple goroutines concurrently create, update, or delete metrics/presets, some operations are lost.

To Reproduce

func TestConcurrentUpdates(t *testing.T) {
	ctx := context.Background()
	tempDir := t.TempDir()
	tempFile := filepath.Join(tempDir, "metrics.yaml")

	yamlrw, err := metrics.NewYAMLMetricReaderWriter(ctx, tempFile)
	assert.NoError(t, err)

	// Create initial empty metrics file
	initialMetrics := &metrics.Metrics{
		MetricDefs: make(map[string]metrics.Metric),
		PresetDefs: make(map[string]metrics.Preset),
	}
	err = yamlrw.WriteMetrics(initialMetrics)
	assert.NoError(t, err)

	// Number of concurrent updates
	numGoroutines := 10
	var wg sync.WaitGroup
	errorChan := make(chan error, numGoroutines)

	// Each goroutine will add a unique metric
	for i := 0; i < numGoroutines; i++ {
		wg.Add(1)
		go func(id int) {
			defer wg.Done()

			metricName := fmt.Sprintf("metric_%d", id)
			testMetric := metrics.Metric{
				Description: fmt.Sprintf("Test metric %d", id),
				SQLs: map[int]string{
					1: fmt.Sprintf("SELECT %d", id),
				},
			}

			// Simulate small random delay
			time.Sleep(time.Millisecond * time.Duration(id%3))

			err := yamlrw.UpdateMetric(metricName, testMetric)
			if err != nil {
				errorChan <- fmt.Errorf("goroutine %d: %w", id, err)
			}
		}(i)
	}

	wg.Wait()
	close(errorChan)

	// Check for errors during updates
	var errors []error
	for err := range errorChan {
		errors = append(errors, err)
		t.Logf("Error during concurrent update: %v", err)
	}

	// ensure all metrics were saved
	finalMetrics, err := yamlrw.GetMetrics()
	assert.NoError(t, err)

	assert.Equal(t, numGoroutines, len(finalMetrics.MetricDefs),
		"Expected %d metrics, but got %d. Some updates were lost due to race condition!",
		numGoroutines, len(finalMetrics.MetricDefs))

	// Print which metrics are missing if test fails
	if len(finalMetrics.MetricDefs) != numGoroutines {
		for i := 0; i < numGoroutines; i++ {
			metricName := fmt.Sprintf("metric_%d", i)
			if _, exists := finalMetrics.MetricDefs[metricName]; !exists {
				t.Logf("LOST UPDATE: %s was not saved", metricName)
			}
		}
	}
}

Expected behavior

  • All 10 concurrent metric updates should be saved
  • No race conditions
  • No data loss under concurrent access

Screenshots

Image

Metadata

Metadata

Assignees

Labels

bugSomething isn't working

Type

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions