Skip to content

[ML] High memory usage for forecasting #26

Closed
@hendrikmuhs

Description

@hendrikmuhs

Using a build of current master, forecasting causes the RSS memory of the autodetect process to grow quickly and beyond reasonable limits:

screenshot_20180327_144114

I could not reproduce this issue using 6.2.3. I will try 6.3 next; if I can reproduce the same issue with it, this becomes a showstopper.

Dataset: cloudwatch2016_snapshot, job config:

{
  "job_id": "c1",
  "job_type": "anomaly_detector",
  "job_version": "6.3.0",
  "description": "",
  "create_time": 1522160336316,
  "established_model_memory": 10448586,
  "analysis_config": {
    "bucket_span": "1m",
    "detectors": [
      {
        "detector_description": "sum(NetworkOut) by instance",
        "function": "sum",
        "field_name": "NetworkOut",
        "by_field_name": "instance",
        "detector_index": 0
      },
      {
        "detector_description": "sum(NetworkIn) by instance",
        "function": "sum",
        "field_name": "NetworkIn",
        "by_field_name": "instance",
        "detector_index": 1
      },
      {
        "detector_description": "mean(DiskReadOps) by instance",
        "function": "mean",
        "field_name": "DiskReadOps",
        "by_field_name": "instance",
        "detector_index": 2
      },
      {
        "detector_description": "mean(DiskWriteOps) by instance",
        "function": "mean",
        "field_name": "DiskWriteOps",
        "by_field_name": "instance",
        "detector_index": 3
      },
      {
        "detector_description": "mean(CPUUtilization) by instance",
        "function": "mean",
        "field_name": "CPUUtilization",
        "by_field_name": "instance",
        "detector_index": 4
      }
    ],
    "influencers": [
      "region",
      "instance",
      "sourcetype.keyword"
    ]
  },
  "analysis_limits": {
    "model_memory_limit": "1024mb",
    "categorization_examples_limit": 4
  },
  "data_description": {
    "time_field": "@timestamp",
    "time_format": "epoch_ms"
  },
  "model_snapshot_retention_days": 1,
  "results_index_name": "shared",
  "data_counts": {
    "job_id": "c1",
    "processed_record_count": 1793481,
    "processed_field_count": 0,
    "input_bytes": 226665140,
    "input_field_count": 6665535,
    "invalid_date_count": 0,
    "missing_field_count": 7682313,
    "out_of_order_timestamp_count": 0,
    "empty_bucket_count": 10,
    "sparse_bucket_count": 0,
    "bucket_count": 20971,
    "earliest_record_timestamp": 1477612800000,
    "latest_record_timestamp": 1478871060000,
    "last_data_time": 1522160865030,
    "latest_empty_bucket_timestamp": 1478870940000,
    "input_record_count": 1793481
  },
  "model_size_stats": {
    "job_id": "c1",
    "result_type": "model_size_stats",
    "model_bytes": 10654082,
    "total_by_field_count": 387,
    "total_over_field_count": 0,
    "total_partition_field_count": 6,
    "bucket_allocation_failures_count": 0,
    "memory_status": "ok",
    "log_time": 1522160865000,
    "timestamp": 1478871000000
  },
  "datafeed_config": {
    "datafeed_id": "datafeed-c1",
    "job_id": "c1",
    "query_delay": "93648ms",
    "indices": [
      "cloudwatch*"
    ],
    "types": [
      
    ],
    "query": {
      "match_all": {
        "boost": 1
      }
    },
    "scroll_size": 1000,
    "chunking_config": {
      "mode": "auto"
    },
    "state": "stopped",
    "node": {
      "id": "ZW77aCkdQ264O8788V35mg",
      "name": "ZW77aCk",
      "ephemeral_id": "R54MM5yVTTyif78FZC0TKw",
      "transport_address": "127.0.0.1:9300",
      "attributes": {
        "ml.machine_memory": "33580257280",
        "ml.max_open_jobs": "20",
        "ml.enabled": "true"
      }
    }
  },
  "state": "opened",
  "node": {
    "id": "ZW77aCkdQ264O8788V35mg",
    "name": "ZW77aCk",
    "ephemeral_id": "R54MM5yVTTyif78FZC0TKw",
    "transport_address": "127.0.0.1:9300",
    "attributes": {
      "ml.machine_memory": "33580257280",
      "ml.max_open_jobs": "20",
      "ml.enabled": "true"
    }
  },
  "open_time": "691s"
}

To reproduce:

  • create a setup with the dataset above and a job config similar to the above, e.g. 'c1'
  • simply feed all data in, should be 1,793,481 processed records
  • call
POST _xpack/ml/anomaly_detectors/c1/_open
POST _xpack/ml/anomaly_detectors/c1/_forecast?duration=10d
  • watch RSS memory, e.g. (replace {pid}):
while true
do
  ps -o rss= {pid} 2>&1 | tee -a logfile
  sleep 10
done

Metadata

Metadata

Assignees

Type

No type

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions