@@ -17,6 +17,8 @@ import (
 	"testing"
 	"time"
 
+	"github.com/cortexproject/cortex/pkg/storage/tsdb"
+
 	"github.com/prometheus/common/model"
 	"github.com/prometheus/prometheus/model/labels"
 	"github.com/prometheus/prometheus/model/rulefmt"
@@ -576,8 +578,8 @@ func TestRulerMetricsForInvalidQueries(t *testing.T) {
 		require.Equal(t, 200, res.StatusCode)
 	}
 
-	totalQueries, err := ruler.SumMetrics([]string{"cortex_ruler_queries_total"})
-	require.NoError(t, err)
+	matcher := labels.MustNewMatcher(labels.MatchEqual, "user", user)
+	var totalQueries = []float64{0}
 
 	// Verify that user-failures don't increase cortex_ruler_queries_failed_total
 	for groupName, expression := range map[string]string{
@@ -601,7 +603,7 @@ func TestRulerMetricsForInvalidQueries(t *testing.T) {
 			require.NoError(t, ruler.WaitSumMetricsWithOptions(e2e.GreaterOrEqual(1), []string{"cortex_prometheus_rule_evaluation_failures_total"}, e2e.WithLabelMatchers(m), e2e.WaitMissingMetrics))
 
 			// But these failures were not reported as "failed queries"
-			sum, err := ruler.SumMetrics([]string{"cortex_ruler_queries_failed_total"})
+			sum, err := ruler.SumMetrics([]string{"cortex_ruler_queries_failed_total"}, e2e.WithLabelMatchers(matcher))
 			require.NoError(t, err)
 			require.Equal(t, float64(0), sum[0])
 
@@ -612,7 +614,7 @@ func TestRulerMetricsForInvalidQueries(t *testing.T) {
 			require.NoError(t, ruler.WaitSumMetricsWithOptions(e2e.Equals(0), []string{"cortex_prometheus_rule_group_rules"}, e2e.SkipMissingMetrics))
 
 			// Check that cortex_ruler_queries_total went up since last test.
-			newTotalQueries, err := ruler.SumMetrics([]string{"cortex_ruler_queries_total"})
+			newTotalQueries, err := ruler.SumMetrics([]string{"cortex_ruler_queries_total"}, e2e.WithLabelMatchers(matcher))
 			require.NoError(t, err)
 			require.Greater(t, newTotalQueries[0], totalQueries[0])
 
@@ -637,15 +639,119 @@ func TestRulerMetricsForInvalidQueries(t *testing.T) {
 		require.NoError(t, ruler.WaitSumMetricsWithOptions(e2e.Equals(0), []string{"cortex_prometheus_rule_evaluation_failures_total"}, e2e.WithLabelMatchers(m), e2e.WaitMissingMetrics))
 
 		// Still no failures.
-		sum, err := ruler.SumMetrics([]string{"cortex_ruler_queries_failed_total"})
+		sum, err := ruler.SumMetrics([]string{"cortex_ruler_queries_failed_total"}, e2e.WithLabelMatchers(matcher))
 		require.NoError(t, err)
 		require.Equal(t, float64(0), sum[0])
 
 		// Now let's stop ingester, and recheck metrics. This should increase cortex_ruler_queries_failed_total failures.
 		require.NoError(t, s.Stop(ingester))
 
 		// We should start getting "real" failures now.
-		require.NoError(t, ruler.WaitSumMetricsWithOptions(e2e.GreaterOrEqual(1), []string{"cortex_ruler_queries_failed_total"}))
+		require.NoError(t, ruler.WaitSumMetricsWithOptions(e2e.GreaterOrEqual(1), []string{"cortex_ruler_queries_failed_total"}, e2e.WithLabelMatchers(matcher)))
+	})
+}
+
+func TestRulerMetricsWhenIngesterFails(t *testing.T) {
+	s, err := e2e.NewScenario(networkName)
+	require.NoError(t, err)
+	defer s.Close()
+
+	// Start dependencies.
+	consul := e2edb.NewConsul()
+	minio := e2edb.NewMinio(9000, bucketName, rulestoreBucketName)
+	require.NoError(t, s.StartAndWaitReady(consul, minio))
+
+	const blockRangePeriod = 2 * time.Second
+	// Configure the ruler.
+	flags := mergeFlags(
+		BlocksStorageFlags(),
+		RulerFlags(),
+		map[string]string{
+			"-blocks-storage.tsdb.block-ranges-period":         blockRangePeriod.String(),
+			"-blocks-storage.tsdb.ship-interval":               "1s",
+			"-blocks-storage.bucket-store.sync-interval":       "1s",
+			"-blocks-storage.bucket-store.index-cache.backend": tsdb.IndexCacheBackendInMemory,
+			"-blocks-storage.tsdb.retention-period":            ((blockRangePeriod * 2) - 1).String(),
+
+			// Enable the bucket index so we can skip the initial bucket scan.
+			"-blocks-storage.bucket-store.bucket-index.enabled": "false",
+			// Evaluate rules often, so that we don't need to wait for metrics to show up.
+			"-ruler.evaluation-interval": "2s",
+			"-ruler.poll-interval":       "2s",
+			// No delay
+			"-ruler.evaluation-delay-duration": "0",
+
+			// We run single ingester only, no replication.
+			"-distributor.replication-factor": "1",
+
+			// Very low limit so that ruler hits it.
+			"-querier.max-fetched-chunks-per-query": "15",
+			"-querier.query-store-after":            (1 * time.Second).String(),
+			"-querier.query-ingesters-within":       (2 * time.Second).String(),
+		},
+	)
+
+	const namespace = "test"
+	const user = "user"
+
+	storeGateway := e2ecortex.NewStoreGateway("store-gateway-1", e2ecortex.RingStoreConsul, consul.NetworkHTTPEndpoint(), flags, "")
+
+	flags = mergeFlags(flags, map[string]string{
+		"-querier.store-gateway-addresses": storeGateway.NetworkGRPCEndpoint(),
+	})
+
+	distributor := e2ecortex.NewDistributor("distributor", e2ecortex.RingStoreConsul, consul.NetworkHTTPEndpoint(), flags, "")
+	ruler := e2ecortex.NewRuler("ruler", consul.NetworkHTTPEndpoint(), flags, "")
+	ingester := e2ecortex.NewIngester("ingester", e2ecortex.RingStoreConsul, consul.NetworkHTTPEndpoint(), flags, "")
+	require.NoError(t, s.StartAndWaitReady(distributor, ingester, ruler, storeGateway))
+
+	// Wait until both the distributor and ruler have updated the ring. The querier will also watch
+	// the store-gateway ring if blocks sharding is enabled.
+	require.NoError(t, distributor.WaitSumMetrics(e2e.Equals(512), "cortex_ring_tokens_total"))
+	require.NoError(t, ruler.WaitSumMetrics(e2e.Equals(1024), "cortex_ring_tokens_total"))
+
+	c, err := e2ecortex.NewClient(distributor.HTTPEndpoint(), "", "", ruler.HTTPEndpoint(), user)
+	require.NoError(t, err)
+
+	matcher := labels.MustNewMatcher(labels.MatchEqual, "user", user)
+	expression := "absent(sum_over_time(metric{}[2s] offset 1h))"
+
+	// Now let's upload a non-failing rule, and make sure that it works.
+	t.Run("real_error", func(t *testing.T) {
+		const groupName = "good_rule"
+
+		var ruleEvalCount float64
+		ruleGroup := ruleGroupWithRule(groupName, "rule", expression)
+		ruleGroup.Interval = 2
+		require.NoError(t, c.SetRuleGroup(ruleGroup, namespace))
+		m := ruleGroupMatcher(user, namespace, groupName)
+
+		// Wait until ruler has loaded the group.
+		require.NoError(t, ruler.WaitSumMetricsWithOptions(e2e.Equals(1), []string{"cortex_prometheus_rule_group_rules"}, e2e.WithLabelMatchers(m), e2e.WaitMissingMetrics))
+
+		// Wait until rule group has tried to evaluate the rule, and succeeded.
+		ruleEvalCount++
+		require.NoError(t, ruler.WaitSumMetricsWithOptions(e2e.GreaterOrEqual(ruleEvalCount), []string{"cortex_prometheus_rule_evaluations_total"}, e2e.WithLabelMatchers(m), e2e.WaitMissingMetrics))
+		require.NoError(t, ruler.WaitSumMetricsWithOptions(e2e.Equals(0), []string{"cortex_prometheus_rule_evaluation_failures_total"}, e2e.WithLabelMatchers(m), e2e.WaitMissingMetrics))
+
+		require.NoError(t, ruler.WaitSumMetricsWithOptions(e2e.GreaterOrEqual(1), []string{"cortex_ruler_write_requests_total"}, e2e.WithLabelMatchers(matcher), e2e.WaitMissingMetrics))
+		require.NoError(t, ruler.WaitSumMetricsWithOptions(e2e.Equals(0), []string{"cortex_ruler_write_requests_failed_total"}, e2e.WithLabelMatchers(matcher), e2e.WaitMissingMetrics))
+
+		require.NoError(t, ruler.WaitSumMetricsWithOptions(e2e.Equals(0), []string{"cortex_ruler_queries_failed_total"}, e2e.WithLabelMatchers(matcher), e2e.WaitMissingMetrics))
+
+		// Wait until the TSDB head is compacted and shipped to the storage.
+		// The shipped block contains the 1st series, while the 2nd series is in the head.
+		require.NoError(t, ingester.WaitSumMetrics(e2e.Equals(1), "cortex_ingester_shipper_uploads_total"))
+
+		// Now let's stop ingester, and recheck metrics. This should increase cortex_ruler_write_requests_failed_total failures.
+		require.NoError(t, s.Stop(ingester))
+		ruleEvalCount++
+		require.NoError(t, ruler.WaitSumMetricsWithOptions(e2e.GreaterOrEqual(ruleEvalCount), []string{"cortex_prometheus_rule_evaluations_total"}, e2e.WithLabelMatchers(m), e2e.WaitMissingMetrics))
+
+		require.NoError(t, ruler.WaitSumMetricsWithOptions(e2e.Equals(0), []string{"cortex_ruler_queries_failed_total"}, e2e.WithLabelMatchers(matcher), e2e.WaitMissingMetrics))
+		require.NoError(t, ruler.WaitSumMetricsWithOptions(e2e.GreaterOrEqual(2), []string{"cortex_ruler_write_requests_total"}, e2e.WithLabelMatchers(matcher), e2e.WaitMissingMetrics))
+
+		require.NoError(t, ruler.WaitSumMetricsWithOptions(e2e.GreaterOrEqual(1), []string{"cortex_ruler_write_requests_failed_total"}, e2e.WithLabelMatchers(matcher), e2e.WaitMissingMetrics))
 	})
 }
 