|
63 | 63 | import org.elasticsearch.xpack.core.ml.job.process.autodetect.state.ModelSnapshot;
|
64 | 64 | import org.elasticsearch.xpack.core.ml.notifications.NotificationsIndex;
|
65 | 65 | import org.elasticsearch.xpack.ml.MachineLearning;
|
| 66 | +import org.elasticsearch.xpack.ml.job.persistence.JobResultsPersister; |
66 | 67 | import org.elasticsearch.xpack.ml.job.process.autodetect.BlackHoleAutodetectProcess;
|
67 | 68 | import org.elasticsearch.xpack.ml.support.BaseMlIntegTestCase;
|
68 | 69 | import org.junit.After;
|
@@ -542,7 +543,108 @@ public void testClusterWithTwoMlNodes_RunsDatafeed_GivenOriginalNodeGoesDown() t
|
542 | 543 | assertThat(dataCounts.getProcessedRecordCount(), equalTo(numDocs));
|
543 | 544 | assertThat(dataCounts.getOutOfOrderTimeStampCount(), equalTo(0L));
|
544 | 545 | });
|
| 546 | + } |
| 547 | + |
| 548 | + public void testClusterWithTwoMlNodes_StopsDatafeed_GivenJobFailsOnReassign() throws Exception { |
| 549 | + internalCluster().ensureAtMostNumDataNodes(0); |
| 550 | + logger.info("Starting dedicated master node..."); |
| 551 | + internalCluster().startMasterOnlyNode(); |
| 552 | + logger.info("Starting ml and data node..."); |
| 553 | + internalCluster().startNode(onlyRoles(Set.of(DiscoveryNodeRole.DATA_ROLE, DiscoveryNodeRole.ML_ROLE))); |
| 554 | + logger.info("Starting another ml and data node..."); |
| 555 | + internalCluster().startNode(onlyRoles(Set.of(DiscoveryNodeRole.DATA_ROLE, DiscoveryNodeRole.ML_ROLE))); |
| 556 | + ensureStableCluster(); |
| 557 | + |
| 558 | + // index some datafeed data |
| 559 | + client().admin().indices().prepareCreate("data").setMapping("time", "type=date").get(); |
| 560 | + long numDocs = 80000; |
| 561 | + long now = System.currentTimeMillis(); |
| 562 | + long weekAgo = now - 604800000; |
| 563 | + long twoWeeksAgo = weekAgo - 604800000; |
| 564 | + indexDocs(logger, "data", numDocs, twoWeeksAgo, weekAgo); |
| 565 | + |
| 566 | + String jobId = "test-node-goes-down-while-running-job"; |
| 567 | + String datafeedId = jobId + "-datafeed"; |
| 568 | + |
| 569 | + Job.Builder job = createScheduledJob(jobId); |
| 570 | + PutJobAction.Request putJobRequest = new PutJobAction.Request(job); |
| 571 | + client().execute(PutJobAction.INSTANCE, putJobRequest).actionGet(); |
| 572 | + |
| 573 | + DatafeedConfig config = createDatafeed(datafeedId, job.getId(), Collections.singletonList("data"), TimeValue.timeValueHours(1)); |
| 574 | + PutDatafeedAction.Request putDatafeedRequest = new PutDatafeedAction.Request(config); |
| 575 | + client().execute(PutDatafeedAction.INSTANCE, putDatafeedRequest).actionGet(); |
| 576 | + |
| 577 | + client().execute(OpenJobAction.INSTANCE, new OpenJobAction.Request(job.getId())); |
| 578 | + |
| 579 | + assertBusy(() -> { |
| 580 | + GetJobsStatsAction.Response statsResponse = client().execute( |
| 581 | + GetJobsStatsAction.INSTANCE, |
| 582 | + new GetJobsStatsAction.Request(job.getId()) |
| 583 | + ).actionGet(); |
| 584 | + assertEquals(JobState.OPENED, statsResponse.getResponse().results().get(0).getState()); |
| 585 | + }, 30, TimeUnit.SECONDS); |
| 586 | + |
| 587 | + DiscoveryNode nodeRunningJob = client().execute(GetJobsStatsAction.INSTANCE, new GetJobsStatsAction.Request(job.getId())) |
| 588 | + .actionGet() |
| 589 | + .getResponse() |
| 590 | + .results() |
| 591 | + .get(0) |
| 592 | + .getNode(); |
| 593 | + |
| 594 | + setMlIndicesDelayedNodeLeftTimeoutToZero(); |
| 595 | + |
| 596 | + StartDatafeedAction.Request startDatafeedRequest = new StartDatafeedAction.Request(config.getId(), 0L); |
| 597 | + client().execute(StartDatafeedAction.INSTANCE, startDatafeedRequest).get(); |
| 598 | + |
| 599 | + waitForJobToHaveProcessedAtLeast(jobId, 1000); |
| 600 | + |
| 601 | + // The datafeed should be started |
| 602 | + assertBusy(() -> { |
| 603 | + GetDatafeedsStatsAction.Response statsResponse = client().execute( |
| 604 | + GetDatafeedsStatsAction.INSTANCE, |
| 605 | + new GetDatafeedsStatsAction.Request(config.getId()) |
| 606 | + ).actionGet(); |
| 607 | + assertEquals(DatafeedState.STARTED, statsResponse.getResponse().results().get(0).getDatafeedState()); |
| 608 | + }, 30, TimeUnit.SECONDS); |
| 609 | + |
| 610 | + // Create a problem that will make the job fail when it restarts on a different node |
| 611 | + String snapshotId = "123"; |
| 612 | + ModelSnapshot modelSnapshot = new ModelSnapshot.Builder(jobId).setSnapshotId(snapshotId).setTimestamp(new Date()).build(); |
| 613 | + JobResultsPersister jobResultsPersister = internalCluster().getInstance( |
| 614 | + JobResultsPersister.class, |
| 615 | + internalCluster().getMasterName() |
| 616 | + ); |
| 617 | + jobResultsPersister.persistModelSnapshot(modelSnapshot, WriteRequest.RefreshPolicy.IMMEDIATE, () -> true); |
| 618 | + UpdateJobAction.Request updateJobRequest = UpdateJobAction.Request.internal( |
| 619 | + jobId, |
| 620 | + new JobUpdate.Builder(jobId).setModelSnapshotId(snapshotId).build() |
| 621 | + ); |
| 622 | + client().execute(UpdateJobAction.INSTANCE, updateJobRequest).actionGet(); |
| 623 | + refresh(AnomalyDetectorsIndex.resultsWriteAlias(jobId)); |
| 624 | + |
| 625 | + // Make the job move to a different node |
| 626 | + internalCluster().stopNode(nodeRunningJob.getName()); |
| 627 | + |
| 628 | + // Wait for the job to fail during reassignment |
| 629 | + assertBusy(() -> { |
| 630 | + GetJobsStatsAction.Response statsResponse = client().execute( |
| 631 | + GetJobsStatsAction.INSTANCE, |
| 632 | + new GetJobsStatsAction.Request(job.getId()) |
| 633 | + ).actionGet(); |
| 634 | + assertEquals(JobState.FAILED, statsResponse.getResponse().results().get(0).getState()); |
| 635 | + }, 30, TimeUnit.SECONDS); |
| 636 | + |
| 637 | + // The datafeed should then be stopped |
| 638 | + assertBusy(() -> { |
| 639 | + GetDatafeedsStatsAction.Response statsResponse = client().execute( |
| 640 | + GetDatafeedsStatsAction.INSTANCE, |
| 641 | + new GetDatafeedsStatsAction.Request(config.getId()) |
| 642 | + ).actionGet(); |
| 643 | + assertEquals(DatafeedState.STOPPED, statsResponse.getResponse().results().get(0).getDatafeedState()); |
| 644 | + }, 30, TimeUnit.SECONDS); |
545 | 645 |
|
| 646 | + // Force close the failed job to clean up |
| 647 | + client().execute(CloseJobAction.INSTANCE, new CloseJobAction.Request(jobId).setForce(true)).actionGet(); |
546 | 648 | }
|
547 | 649 |
|
548 | 650 | private void setupJobWithoutDatafeed(String jobId, ByteSizeValue modelMemoryLimit) throws Exception {
|
|
0 commit comments