apache · nfsantos · Sep 16, 2024 · Sep 13, 2024 · Sep 13, 2024 · Sep 13, 2024
diff --git a/...ava/org/apache/jackrabbit/oak/index/indexer/document/flatfile/pipelined/ConfigHelper.java b/...ava/org/apache/jackrabbit/oak/index/indexer/document/flatfile/pipelined/ConfigHelper.java
@@ -18,9 +18,14 @@
  */
 package org.apache.jackrabbit.oak.index.indexer.document.flatfile.pipelined;
 
+import org.apache.commons.lang3.StringUtils;
 import org.slf4j.Logger;
 import org.slf4j.LoggerFactory;
 
+import java.util.Arrays;
+import java.util.List;
+import java.util.stream.Collectors;
+
 public class ConfigHelper {
     private static final Logger LOG = LoggerFactory.getLogger(ConfigHelper.class);
 
@@ -48,4 +53,18 @@ public static boolean getSystemPropertyAsBoolean(String name, boolean defaultVal
         LOG.info("Config {}={}", name, value);
         return value;
     }
+
+    /**
+     * Reads the system property {@code name}, falling back to {@code defaultValue} when the property is not set, and
+     * splits the value on {@code separator}. White space at the start/end of the string, or at the start/end of each
+     * part delimited by the separator, is trimmed. A blank value yields an empty list. The resulting list is logged
+     * at INFO level.
+     *
+     * @param name         name of the system property to read
+     * @param defaultValue value to split when the system property is not set
+     * @param separator    character that delimits the parts
+     * @return the trimmed parts, in the order in which they appear in the value
+     */
+    public static List<String> getSystemPropertyAsStringList(String name, String defaultValue, char separator) {
+        final List<String> values = splitString(System.getProperty(name, defaultValue), separator);
+        LOG.info("Config {}={}", name, values);
+        return values;
+    }
+
+    /**
+     * Splits {@code str} on {@code separator} and trims each part.
+     * <p>
+     * Parts that are empty after trimming are dropped. This matters to callers that use the parts as suffixes: an
+     * empty string is a suffix of every string, so a value such as {@code "a; ;b"} must not produce an empty entry.
+     *
+     * @param str       the string to split, may be {@code null}
+     * @param separator character that delimits the parts
+     * @return the non-empty trimmed parts in their original order; an empty list if {@code str} is {@code null} or blank
+     */
+    private static List<String> splitString(String str, char separator) {
+        if (str == null || str.isBlank()) {
+            return List.of();
+        }
+        return Arrays.stream(StringUtils.split(str, separator))
+                .map(String::trim)
+                // StringUtils.split already drops zero-length tokens, but a token made only of white space
+                // survives the split and becomes "" once trimmed.
+                .filter(part -> !part.isEmpty())
+                .collect(Collectors.toList());
+    }
 }
diff --git a/...rg/apache/jackrabbit/oak/index/indexer/document/flatfile/pipelined/NodeDocumentCodec.java b/...rg/apache/jackrabbit/oak/index/indexer/document/flatfile/pipelined/NodeDocumentCodec.java
@@ -18,7 +18,8 @@
  */
 package org.apache.jackrabbit.oak.index.indexer.document.flatfile.pipelined;
 
-import org.apache.commons.lang3.mutable.MutableInt;
+import org.apache.commons.lang3.mutable.MutableLong;
+import org.apache.jackrabbit.oak.commons.IOUtils;
 import org.apache.jackrabbit.oak.plugins.document.Collection;
 import org.apache.jackrabbit.oak.plugins.document.Document;
 import org.apache.jackrabbit.oak.plugins.document.NodeDocument;
@@ -35,9 +36,15 @@
 import org.bson.codecs.DecoderContext;
 import org.bson.codecs.EncoderContext;
 import org.bson.codecs.configuration.CodecRegistry;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
 
+import java.util.List;
 import java.util.SortedMap;
 import java.util.TreeMap;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.atomic.AtomicLong;
+import java.util.stream.Collectors;
 
 /**
  * Custom codec for MongoDB to transform a stream of BSON tokens into a NodeDocument. This custom codec provides two
@@ -49,44 +56,111 @@
  *   <li>Allows estimating the size of the document while reading it, which will have a negligible overhead (as compared
  *   with doing an additional traverse of the object structure to compute the size).</li>
  * </ul>
- *
+ * <p>
- * This class must be thread-safe, Mongo uses a single coded implementation across multiple threads.
+ * This class must be thread-safe: Mongo uses a single codec instance across multiple threads.
- *
  */
 public class NodeDocumentCodec implements Codec<NodeDocument> {
+    private final static Logger LOG = LoggerFactory.getLogger(NodeDocumentCodec.class);
+
+    // Names of the system properties that configure the NodeDocumentFilter used by this codec.
+    public static final String OAK_INDEXER_PIPELINED_NODE_DOCUMENT_FILTER_FILTERED_PATH = "oak.indexer.pipelined.nodeDocument.filter.filteredPath";
+    public static final String OAK_INDEXER_PIPELINED_NODE_DOCUMENT_FILTER_SUFFIXES_TO_SKIP = "oak.indexer.pipelined.nodeDocument.filter.suffixesToSkip";
+    // Both settings are read through ConfigHelper when the codec instance is created, passing an empty string as the
+    // default. The suffixes are given as a single string with the entries separated by ';'.
+    private final String filteredPath = ConfigHelper.getSystemPropertyAsString(OAK_INDEXER_PIPELINED_NODE_DOCUMENT_FILTER_FILTERED_PATH, "");
+    private final List<String> suffixesToSkip = ConfigHelper.getSystemPropertyAsStringList(OAK_INDEXER_PIPELINED_NODE_DOCUMENT_FILTER_SUFFIXES_TO_SKIP, "", ';');
+
     // The estimated size is stored in the NodeDocument itself
     public final static String SIZE_FIELD = "_ESTIMATED_SIZE_";
+
+    /**
+     * Mutable decoding state. The codec keeps one instance per thread in a ThreadLocal, so the fields are plain,
+     * unsynchronized values. All of them start at zero.
+     */
+    private static class NodeDocumentDecoderContext {
+        // Running size estimate of the document being decoded; decode() resets it at the start of each document
+        int estimatedSizeOfCurrentObject;
+        // Number of documents that decode() completed using this context
+        long docsDecoded;
+        // Sum of the estimated sizes of the documents counted in docsDecoded
+        long dataDownloaded;
+    }
+
+    // Shared placeholder that decode() returns in place of a document rejected by the filter. It is created in the
+    // constructor and not populated anywhere in the visible code, so its _id stays unset.
+    private final NodeDocument emptyNodeDocument;
+
     private final MongoDocumentStore store;
     private final Collection<NodeDocument> collection;
     private final BsonTypeCodecMap bsonTypeCodecMap;
     private final DecoderContext decoderContext = DecoderContext.builder().build();
-
     private final Codec<String> stringCoded;
     private final Codec<Long> longCoded;
     private final Codec<Boolean> booleanCoded;
 
+    // Built from the filteredPath and suffixesToSkip fields, which are declared (and therefore initialized) earlier
+    // in this class.
+    private final NodeDocumentFilter nodeDocumentFilter = new NodeDocumentFilter(filteredPath, suffixesToSkip);
+
+    // Statistics
+    // Totals accumulated by every thread that decodes with this codec instance
+    private final AtomicLong totalDocsDecoded = new AtomicLong(0);
+    private final AtomicLong totalDataDownloaded = new AtomicLong(0);
+    // Counters and scratch state private to each decoding thread
+    private final ThreadLocal<NodeDocumentDecoderContext> perThreadContext = ThreadLocal.withInitial(NodeDocumentDecoderContext::new);
+
+    /**
+     * Creates a codec that decodes documents of the given collection.
+     *
+     * @param store           the document store passed to {@code collection.newDocument} when creating documents
+     * @param collection      the collection used to create the {@link NodeDocument} instances
+     * @param defaultRegistry the registry used to build the map of codecs by BSON type
+     */
     public NodeDocumentCodec(MongoDocumentStore store, Collection<NodeDocument> collection, CodecRegistry defaultRegistry) {
         this.store = store;
         this.collection = collection;
         this.bsonTypeCodecMap = new BsonTypeCodecMap(new BsonTypeClassMap(), defaultRegistry);
+        // Left without any field; decode() returns this instance in place of documents rejected by the filter
+        this.emptyNodeDocument = collection.newDocument(store);
         // Retrieve references to the most commonly used codecs, to avoid the map lookup in the common case
         this.stringCoded = (Codec<String>) bsonTypeCodecMap.get(BsonType.STRING);
         this.longCoded = (Codec<Long>) bsonTypeCodecMap.get(BsonType.INT64);
         this.booleanCoded = (Codec<Boolean>) bsonTypeCodecMap.get(BsonType.BOOLEAN);
     }
 
+    /**
+     * Consumes the remaining fields of the document currently being read, without decoding them, and then consumes
+     * the end-of-document marker.
+     * <p>
+     * Skipping over values in the BSON file is faster than reading them. Skipping is done by advancing a pointer in
+     * an internal buffer, while reading requires converting them to a Java data type (typically String).
+     *
+     * @param reader the reader, positioned inside a document, before the type of the next field has been read
+     */
+    private void skipUntilEndOfDocument(BsonReader reader) {
+        for (BsonType next = reader.readBsonType(); next != BsonType.END_OF_DOCUMENT; next = reader.readBsonType()) {
+            reader.skipName();
+            reader.skipValue();
+        }
+        reader.readEndDocument();
+    }
+
     @Override
     public NodeDocument decode(BsonReader reader, DecoderContext decoderContext) {
         NodeDocument nodeDocument = collection.newDocument(store);
-        MutableInt estimatedSizeOfCurrentObject = new MutableInt(0);
+        NodeDocumentDecoderContext threadLocalContext = perThreadContext.get();
+        threadLocalContext.estimatedSizeOfCurrentObject = 0;
         reader.readStartDocument();
         while (reader.readBsonType() != BsonType.END_OF_DOCUMENT) {
             String fieldName = reader.readName();
-            Object value = readValue(reader, fieldName, estimatedSizeOfCurrentObject);
+            Object value = readValue(reader, fieldName, threadLocalContext);
+            // Once we read the _id or the _path, apply the filter
+            if (!nodeDocumentFilter.isFilteringDisabled()
+                    && (fieldName.equals(NodeDocument.ID) || fieldName.equals(NodeDocument.PATH))
+                    && (value instanceof String)) {
+                // value should always be non-null and of type String, but we do not want the filter to ever break the
+                // downlaoder as filtering is best-effort and just a performance optimization. So we check anyway that
+                // value is what we expect it to be, and if not, just skip trying to filter.
+                if (nodeDocumentFilter.shouldSkip(fieldName, (String) value)) {
+                    skipUntilEndOfDocument(reader);
+                    // The Mongo driver requires us to return a document. To indicate that the document should be skipped,
+                    // we return an empty document. The logic reading from the Mongo cursor can then check if the _id of
+                    // the document is null, which indicates that the document should be skipped.
+                    return emptyNodeDocument;
+                }
+            }
             nodeDocument.put(fieldName, value);
         }
         reader.readEndDocument();
-        nodeDocument.put(SIZE_FIELD, estimatedSizeOfCurrentObject.toInteger());
+        threadLocalContext.docsDecoded++;
+        threadLocalContext.dataDownloaded += threadLocalContext.estimatedSizeOfCurrentObject;
+        long docsDecodedLocal = totalDocsDecoded.incrementAndGet();
+        long dataDownloadedLocal = totalDataDownloaded.addAndGet(threadLocalContext.estimatedSizeOfCurrentObject);
+        if (docsDecodedLocal % 500_000 == 0) {
+            ConcurrentHashMap<String, MutableLong> filteredSuffixes = nodeDocumentFilter.getFilteredSuffixesCounts();
+            long totalDocumentsFiltered = filteredSuffixes.values().stream().mapToLong(MutableLong::longValue).sum();
+            String filteredRenditionsString = filteredSuffixes.entrySet().stream()
+                    .sorted((e1, e2) -> e2.getValue().compareTo(e1.getValue()))
+                    .limit(10)
+                    .map(e -> e.getKey() + "=" + e.getValue())
+                    .collect(Collectors.joining(", ", "{", "}"));
+            LOG.info("docsDecodedByThread: {}, dataDownloadedByThread: {}, docsDecodedTotal: {}, dataDownloadedTotal: {}, docsSkippedTotal {}, filteredRenditionsTotal (top 10): {}",
+                    threadLocalContext.docsDecoded, IOUtils.humanReadableByteCountBin(threadLocalContext.dataDownloaded),
+                    totalDocsDecoded, IOUtils.humanReadableByteCountBin(dataDownloadedLocal),
+                    totalDocumentsFiltered, filteredRenditionsString);
+        }
+        nodeDocument.put(SIZE_FIELD, threadLocalContext.estimatedSizeOfCurrentObject);
         return nodeDocument;
     }
 
@@ -100,7 +174,7 @@ public Class<NodeDocument> getEncoderClass() {
         return NodeDocument.class;
     }
 
-    private Object readValue(BsonReader reader, String fieldName, MutableInt estimatedSizeOfCurrentObject) {
+    private Object readValue(BsonReader reader, String fieldName, NodeDocumentDecoderContext threadContext) {
         BsonType bsonType = reader.getCurrentBsonType();
         Object value;
         int valSize;
@@ -115,7 +189,7 @@ private Object readValue(BsonReader reader, String fieldName, MutableInt estimat
                 valSize = 16;
                 break;
             case DOCUMENT:
-                value = readDocument(reader, estimatedSizeOfCurrentObject);
+                value = readDocument(reader, threadContext);
                 valSize = 0; // the size is updated by the recursive calls inside readDocument
                 break;
             case BOOLEAN:
@@ -141,16 +215,16 @@ private Object readValue(BsonReader reader, String fieldName, MutableInt estimat
                 }
                 break;
         }
-        estimatedSizeOfCurrentObject.add(16 + fieldName.length() + valSize);
+        threadContext.estimatedSizeOfCurrentObject += 16 + fieldName.length() + valSize;
         return value;
     }
 
-    private SortedMap<Revision, Object> readDocument(BsonReader reader, MutableInt estimatedSizeOfCurrentObject) {
+    private SortedMap<Revision, Object> readDocument(BsonReader reader, NodeDocumentDecoderContext threadContext) {
         TreeMap<Revision, Object> map = new TreeMap<>(StableRevisionComparator.REVERSE);
         reader.readStartDocument();
         while (reader.readBsonType() != BsonType.END_OF_DOCUMENT) {
             String fieldName = reader.readName();
-            Object value = readValue(reader, fieldName, estimatedSizeOfCurrentObject);
+            Object value = readValue(reader, fieldName, threadContext);
             map.put(Revision.fromString(fieldName), value);
         }
         reader.readEndDocument();

diff --git a/...g/apache/jackrabbit/oak/index/indexer/document/flatfile/pipelined/NodeDocumentFilter.java b/...g/apache/jackrabbit/oak/index/indexer/document/flatfile/pipelined/NodeDocumentFilter.java
@@ -0,0 +1,120 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+package org.apache.jackrabbit.oak.index.indexer.document.flatfile.pipelined;
+
+import org.apache.commons.lang3.mutable.MutableLong;
+import org.apache.jackrabbit.oak.plugins.document.NodeDocument;
+import org.slf4j.Logger;
+import org.slf4j.LoggerFactory;
+
+import java.util.List;
+import java.util.concurrent.ConcurrentHashMap;
+import java.util.concurrent.atomic.AtomicLong;
+
+/**
+ * Implements a filter to decide if a given Mongo document should be processed or ignored based on its path. The filter has
+ * two configuration parameters:
+ *
+ * <ul>
+ * <li> filteredPath - The path where the filter is applied. Only the documents inside this path will be considered for filtering.
+ *   Documents in other paths will all be accepted. The check is a plain string prefix test on the path part of the value.
+ * <li> suffixesToSkip - A list of suffixes to filter. That is, any document whose path ends in one of these suffixes will
+ *   be filtered.
+ * </ul>
+ * <p>
+ * Filtering is disabled if filteredPath is blank or suffixesToSkip is empty.
+ * <p>
+ * The intent of this filter is to be applied as close as possible to the download/decoding of the documents from Mongo,
+ * in order to filter unnecessary documents early and avoid spending resources processing them.
+ * <p>
+ * The configuration is fixed at construction time and the statistics are updated atomically, so that counts are not
+ * lost when {@link #shouldSkip(String, String)} is called from several threads.
+ */
+public class NodeDocumentFilter {
+    private static final Logger LOG = LoggerFactory.getLogger(NodeDocumentFilter.class);
+
+    private final String filteredPath;
+    private final List<String> suffixesToSkip;
+
+    private final boolean filteringDisabled;
+
+    // Statistics
+    // Number of values (either _id or _path) for which shouldSkip returned true
+    private final AtomicLong skippedFields = new AtomicLong(0);
+    // Subset of skippedFields where the decision was taken on the _path field
+    private final AtomicLong longPathSkipped = new AtomicLong(0);
+    // Number of skipped documents per matching suffix
+    private final ConcurrentHashMap<String, MutableLong> filteredSuffixesCounts = new ConcurrentHashMap<>();
+
+    /**
+     * @param filteredPath   the path prefix that a document must have to be considered for filtering
+     * @param suffixesToSkip the suffixes that cause a document under filteredPath to be skipped
+     */
+    public NodeDocumentFilter(String filteredPath, List<String> suffixesToSkip) {
+        this.filteredPath = filteredPath;
+        this.suffixesToSkip = suffixesToSkip;
+        this.filteringDisabled = filteredPath.isBlank() || suffixesToSkip.isEmpty();
+        if (filteringDisabled) {
+            LOG.info("Node document filtering disabled.");
+        }
+    }
+
+    /**
+     * @param fieldName     Name of the Mongo document field. Expected to be either  _id or _path
+     * @param idOrPathValue The value of the field
+     * @return true if the document should be skipped, false otherwise
+     */
+    public boolean shouldSkip(String fieldName, String idOrPathValue) {
+        if (filteringDisabled) {
+            return false;
+        }
+        // Check if the NodeDocument should be considered for filtering, that is, if it starts with filteredPath.
+        // If the value is for an _id, then we must find the start of the path section, that is, the position of the first
+        // slash (3:/foo/bar/baz). If the value given is for a path, then it already contains only the path. In any case,
+        // we look up for the first occurrence of /
+        int idxOfFirstForwardSlash = idOrPathValue.indexOf('/');
+        if (idxOfFirstForwardSlash < 0) {
+            LOG.warn("Invalid field. {} = {}", fieldName, idOrPathValue);
+            return false;
+        }
+        if (!idOrPathValue.startsWith(filteredPath, idxOfFirstForwardSlash)) {
+            // Outside of the filtered path, always accepted.
+            return false;
+        }
+        // Inside the filtered path. Check if it ends with any of the suffixes to skip.
+        for (String suffix : suffixesToSkip) {
+            if (idOrPathValue.endsWith(suffix)) {
+                // This node document should be skipped.
+                // MutableLong is not thread-safe, so the increment must happen inside compute(), which the map
+                // performs atomically for the key. Incrementing the value returned by computeIfAbsent() would
+                // race with other threads and lose updates.
+                filteredSuffixesCounts.compute(suffix, (k, count) -> {
+                    if (count == null) {
+                        return new MutableLong(1);
+                    }
+                    count.increment();
+                    return count;
+                });
+                long skippedSoFar = skippedFields.incrementAndGet();
+                if (fieldName.equals(NodeDocument.PATH)) {
+                    longPathSkipped.incrementAndGet();
+                }
+                if (skippedSoFar % 50_000 == 0) {
+                    LOG.info("skippedSoFar: {}. Long path: {}, Doc: {}={}", skippedSoFar, longPathSkipped.get(), fieldName, idOrPathValue);
+                }
+                return true;
+            }
+        }
+        return false;
+    }
+
+
+    /**
+     * @return true if the filter was configured in a way that no document can ever be skipped
+     */
+    public boolean isFilteringDisabled() {
+        return filteringDisabled;
+    }
+
+    /**
+     * @return the number of times that shouldSkip returned true
+     */
+    public long getSkippedFields() {
+        return skippedFields.get();
+    }
+
+    /**
+     * @return the number of times that shouldSkip returned true for a _path field
+     */
+    public long getLongPathSkipped() {
+        return longPathSkipped.get();
+    }
+
+    /**
+     * @return the live map (not a copy) with the number of skipped documents for each suffix that matched at least once
+     */
+    public ConcurrentHashMap<String, MutableLong> getFilteredSuffixesCounts() {
+        return filteredSuffixesCounts;
+    }
+}
diff --git a/.../jackrabbit/oak/index/indexer/document/flatfile/pipelined/PipelinedMongoDownloadTask.java b/.../jackrabbit/oak/index/indexer/document/flatfile/pipelined/PipelinedMongoDownloadTask.java
@@ -786,7 +786,11 @@ void download(FindIterable<NodeDocument> mongoIterable) throws InterruptedExcept
                 try {
                     while (cursor.hasNext()) {
                         NodeDocument next = cursor.next();
+                        // If the id is not set, then the document was filtered by NodeDocumentFilter and should be ignored
                         String id = next.getId();
+                        if (id == null) {
+                            continue;
+                        }
                         // All the Mongo queries in this class have a requirement on the _modified field, so the
                         // documents downloaded will all have the field defined.
                         this.nextLastModified = next.getModified();
@@ -796,7 +800,7 @@ void download(FindIterable<NodeDocument> mongoIterable) throws InterruptedExcept
                         this.lastIdDownloaded = id;
                         this.documentsDownloadedTotal++;
                         downloadStatics.incrementDocumentsDownloadedTotal();
-                        if (this.documentsDownloadedTotal % 20_000 == 0) {
+                        if (this.documentsDownloadedTotal % 50_000 == 0) {
                             reportProgress(id);
                         }
                         TRAVERSAL_LOG.trace(id);