Intern IndexFieldCapabilities Type String on Read (#76405)

original-brownbear · web-flow · commit 6d20dbc6db26 · 2021-09-15T13:00:24.000+02:00
When handling a large number of these messages, i.e. when fetching field caps
for many indices (and/or when those indices contain lots of fields), the type string is repeated
many times over. As these strings are already interned because they are constants, taking
the performance hit of interning them on deserialization seems a reasonable trade-off
for the benefit of saving a non-trivial amount of memory for large clusters as well as
speeding up `org.elasticsearch.action.fieldcaps.TransportFieldCapabilitiesAction#merge`,
which uses these strings in map lookups and will run significantly faster with interned strings
instead of fresh strings that do not have their hash values cached yet.
diff --git a/server/src/main/java/org/elasticsearch/action/fieldcaps/IndexFieldCapabilities.java b/server/src/main/java/org/elasticsearch/action/fieldcaps/IndexFieldCapabilities.java
@@ -11,6 +11,7 @@
 import org.elasticsearch.common.io.stream.StreamInput;
 import org.elasticsearch.common.io.stream.StreamOutput;
 import org.elasticsearch.common.io.stream.Writeable;
+import org.elasticsearch.common.util.StringLiteralDeduplicator;
 
 import java.io.IOException;
 import java.util.Map;
@@ -21,6 +22,8 @@
  */
 public class IndexFieldCapabilities implements Writeable {
 
+    private static final StringLiteralDeduplicator typeStringDeduplicator = new StringLiteralDeduplicator();
+
     private final String name;
     private final String type;
     private final boolean isMetadatafield;
@@ -50,7 +53,7 @@ public class IndexFieldCapabilities implements Writeable {
 
     IndexFieldCapabilities(StreamInput in) throws IOException {
         this.name = in.readString();
-        this.type = in.readString();
+        this.type = typeStringDeduplicator.deduplicate(in.readString());
         this.isMetadatafield = in.readBoolean();
         this.isSearchable = in.readBoolean();
         this.isAggregatable = in.readBoolean();
diff --git a/server/src/main/java/org/elasticsearch/common/util/StringLiteralDeduplicator.java b/server/src/main/java/org/elasticsearch/common/util/StringLiteralDeduplicator.java
@@ -0,0 +1,45 @@
+/*
+ * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
+ * or more contributor license agreements. Licensed under the Elastic License
+ * 2.0 and the Server Side Public License, v 1; you may not use this file except
+ * in compliance with, at your election, the Elastic License 2.0 or the Server
+ * Side Public License, v 1.
+ */
+package org.elasticsearch.common.util;
+
+import org.apache.logging.log4j.LogManager;
+import org.apache.logging.log4j.Logger;
+import org.elasticsearch.common.util.concurrent.ConcurrentCollections;
+
+import java.util.Map;
+
+/**
+ * A cache in front of Java's string interning. This class assumes that it is only called with strings that are already part of the
+ * JVM's string pool so that interning them does not grow the pool. Calling it with strings not in the interned string pool is not
+ * advisable as its performance may deteriorate to slower than outright calls to {@link String#intern()}.
+ */
+public final class StringLiteralDeduplicator {
+
+    private static final Logger logger = LogManager.getLogger(StringLiteralDeduplicator.class);
+
+    private static final int MAX_SIZE = 1000; // once the cache holds more entries than this, the next miss clears it
+
+    private final Map<String, String> map = ConcurrentCollections.newConcurrentMapWithAggressiveConcurrency();
+
+    public StringLiteralDeduplicator() {
+    }
+
+    public String deduplicate(String string) { // returns the interned instance equal to the given string
+        final String res = map.get(string); // fast path: cache hit avoids the String#intern() call
+        if (res != null) {
+            return res;
+        }
+        final String interned = string.intern(); // cache miss: fall back to the JVM string pool
+        if (map.size() > MAX_SIZE) { // bound the cache by wiping it entirely rather than evicting single entries
+            map.clear(); // NOTE(review): size check and clear are separate calls, not atomic - presumably benign; confirm
+            logger.debug("clearing intern cache");
+        }
+        map.put(interned, interned); // key and value are the same interned instance
+        return interned;
+    }
+}