[core][format] Optimize manifest reading performance, add pushdown for manifest and orc. #4497

Open

ranxianglei wants to merge 32 commits into apache:master from ranxianglei:op_manifest

Changes from 31 commits · 32 commits
9873457
[core] fix hll class not found
Aug 21, 2024
dedcaa4
[format][orc] open orc switch useSelected,allowSARGToFilter to make s…
Sep 22, 2024
4794d5f
Merge branch 'master' of github.com:ranxianglei/paimon
Sep 22, 2024
462edc6
[format][orc] miss tolerateMissingSchema
Sep 22, 2024
2226fb9
[format][orc] fix orc selected close for no filter condition
Sep 22, 2024
ee915c8
[orc] keep useSelected and allowSARGToFilter close default, or deleti…
Sep 27, 2024
2920fc9
[format][orc] VectorizedRowBatch to OrcColumnVector for selected rows…
Oct 15, 2024
8a89649
[format][orc] remove all isRepeating
Oct 15, 2024
c745e55
[core][format] merge with aa16c2bf1
Nov 11, 2024
7841f25
[core][format] merge conflicts
Nov 11, 2024
bbdd316
[core] fix AuditLogTable merge error
Nov 11, 2024
10ef09c
[format] recover HadoopFileIO
Nov 12, 2024
a2acbab
[format] checkstyle
Nov 12, 2024
e1c90c7
[format][orc] add pushdown option only for reader .
Nov 12, 2024
4364ac1
Merge branch 'master' into op_manifest
ranxianglei Nov 12, 2024
8c9a75c
[core] recover bucket
Nov 12, 2024
dfaeac3
Merge branch 'op_manifest' of github.com:ranxianglei/paimon into op_m…
Nov 12, 2024
f71d658
[core][format] add test for withBuckets and orcFormat
Nov 12, 2024
efac5b6
[format] fix checkstyle
Nov 12, 2024
15b1910
[format] fix version caused error
Nov 12, 2024
e1b3406
[core] fix checkstyle
Nov 12, 2024
d48fff6
[format] add FormatPerformanceTest
Nov 13, 2024
a0efae2
[format][tests] FormatPerformanceTest change to 10 times
Nov 13, 2024
016620c
[format][tests] FormatPerformanceTest change to lessthan to pass gith…
Nov 13, 2024
133a491
[format] merge conflicts
Nov 14, 2024
282a2c9
[format] id to lowercase
Nov 14, 2024
884d12f
[tests] core org.apache.paimon.factories.Factory
Nov 14, 2024
a1cc9f4
[tests] fileFormat factories add to paimon-flink-common
Nov 14, 2024
e401844
[core] resolve withBuckets commit
Nov 15, 2024
710af06
[format] no need call rowMapper under getArray
Nov 15, 2024
669dc30
[core] cancel manifest format factory cache for while .
Nov 18, 2024
00db1f6
Merge branch 'apache:master' into op_manifest
ranxianglei Dec 16, 2024
@@ -19,6 +19,7 @@
package org.apache.paimon.format;

import org.apache.paimon.CoreOptions;
+import org.apache.paimon.factories.FactoryUtil;
import org.apache.paimon.format.FileFormatFactory.FormatContext;
import org.apache.paimon.options.Options;
import org.apache.paimon.predicate.Predicate;
@@ -32,7 +33,6 @@
import java.util.List;
import java.util.Map;
import java.util.Optional;
-import java.util.ServiceLoader;

/**
* Factory class which creates reader and writer factories for specific file format.
@@ -88,26 +88,15 @@ public static FileFormat fromIdentifier(String identifier, Options options) {

/** Create a {@link FileFormat} from format identifier and format options. */
    public static FileFormat fromIdentifier(String identifier, FormatContext context) {
-        return fromIdentifier(identifier, context, FileFormat.class.getClassLoader())
-                .orElseThrow(
-                        () ->
-                                new RuntimeException(
-                                        String.format(
-                                                "Could not find a FileFormatFactory implementation class for %s format",
-                                                identifier)));
-    }
-
-    private static Optional<FileFormat> fromIdentifier(
-            String formatIdentifier, FormatContext context, ClassLoader classLoader) {
-        ServiceLoader<FileFormatFactory> serviceLoader =
-                ServiceLoader.load(FileFormatFactory.class, classLoader);
-        for (FileFormatFactory factory : serviceLoader) {
-            if (factory.identifier().equals(formatIdentifier.toLowerCase())) {
-                return Optional.of(factory.create(context));
-            }
-        }
-
-        return Optional.empty();
+        if (identifier != null) {
+            identifier = identifier.toLowerCase();
+        }
+
+        FileFormatFactory fileFormatFactory =
+                FactoryUtil.discoverFactory(
JingsongLi (Contributor):

Can you just create a PR for FileFormatFactory?

ranxianglei (Contributor Author):

@JingsongLi Of course you can, but I'll change it in a few days. I've been a little busy lately.

+                        FileFormatFactory.class.getClassLoader(),
+                        FileFormatFactory.class,
+                        identifier);
+        return fileFormatFactory.create(context);
    }

protected Options getIdentifierPrefixOptions(Options options) {
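As a usage note: format lookup now goes through the shared factory discovery utility instead of a per-call ServiceLoader scan. A minimal sketch, assuming the long-standing `fromIdentifier(String, Options)` overload shown in the hunk header above:

```java
import org.apache.paimon.format.FileFormat;
import org.apache.paimon.options.Options;

public class FormatLookupSketch {
    public static void main(String[] args) {
        // The identifier is lower-cased before discovery, so "ORC" and "orc"
        // resolve to the same FileFormatFactory via FactoryUtil.discoverFactory.
        FileFormat orc = FileFormat.fromIdentifier("ORC", new Options());
        System.out.println(orc.getClass().getName());
    }
}
```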
@@ -19,13 +19,14 @@
package org.apache.paimon.format;

import org.apache.paimon.annotation.VisibleForTesting;
+import org.apache.paimon.factories.Factory;
import org.apache.paimon.options.MemorySize;
import org.apache.paimon.options.Options;

import javax.annotation.Nullable;

/** Factory to create {@link FileFormat}. */
-public interface FileFormatFactory {
+public interface FileFormatFactory extends Factory {

String identifier();

@@ -0,0 +1,57 @@
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.paimon.fs;

import org.apache.paimon.shade.caffeine2.com.github.benmanes.caffeine.cache.Cache;
import org.apache.paimon.shade.caffeine2.com.github.benmanes.caffeine.cache.Caffeine;

import java.time.Duration;
import java.util.function.Function;

/**
 * A simple object cache manager backed by a Caffeine cache.
 *
 * @param <K> key type
 * @param <V> value type
 */
public class ObjectCacheManager<K, V> {
private final Cache<K, V> cache;

private ObjectCacheManager(Duration timeout, int maxSize) {
this.cache = Caffeine.newBuilder().maximumSize(maxSize).expireAfterWrite(timeout).build();
}

public static <K, V> ObjectCacheManager<K, V> newObjectCacheManager(
Duration timeout, int maxSize) {
return new ObjectCacheManager<>(timeout, maxSize);
}

public ObjectCacheManager<K, V> put(K k, V v) {
this.cache.put(k, v);
return this;
}

public V get(K k, Function<? super K, ? extends V> creator) {
return this.cache.get(k, creator);
}

public V getIfPresent(K k) {
return this.cache.getIfPresent(k);
}
}
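A short usage sketch of the new cache wrapper (key and value types here are illustrative, not from the PR):

```java
import org.apache.paimon.fs.ObjectCacheManager;

import java.time.Duration;

public class CacheSketch {
    public static void main(String[] args) {
        // Up to 1000 entries, each expiring 10 minutes after write.
        ObjectCacheManager<String, Long> cache =
                ObjectCacheManager.newObjectCacheManager(Duration.ofMinutes(10), 1000);

        cache.put("manifest-0", 42L);
        System.out.println(cache.getIfPresent("manifest-0")); // 42
        // get(k, creator) computes and caches the value on a miss.
        System.out.println(cache.get("manifest-1", k -> 0L)); // 0
    }
}
```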
@@ -37,11 +37,18 @@ public class ManifestEntrySerializer extends VersionedObjectSerializer<ManifestE

private final DataFileMetaSerializer dataFileMetaSerializer;

private static final ManifestEntrySerializer MANIFEST_ENTRY_SERIALIZER =
new ManifestEntrySerializer();

public ManifestEntrySerializer() {
super(ManifestEntry.SCHEMA);
this.dataFileMetaSerializer = new DataFileMetaSerializer();
}

public static ManifestEntrySerializer getInstance() {
return MANIFEST_ENTRY_SERIALIZER;
}

@Override
public int getVersion() {
return 2;
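The point of the shared instance is allocation, not semantics: every factory call previously built a fresh serializer. A minimal check of the new accessor (assuming the class lives in `org.apache.paimon.manifest`):

```java
import org.apache.paimon.manifest.ManifestEntrySerializer;

public class SerializerReuseSketch {
    public static void main(String[] args) {
        // Both call sites receive the same pre-built serializer instead of
        // allocating a new ManifestEntrySerializer per create() call.
        ManifestEntrySerializer a = ManifestEntrySerializer.getInstance();
        ManifestEntrySerializer b = ManifestEntrySerializer.getInstance();
        System.out.println(a == b); // true: one shared instance
    }
}
```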
@@ -27,6 +27,7 @@
import org.apache.paimon.fs.Path;
import org.apache.paimon.io.RollingFileWriter;
import org.apache.paimon.io.SingleFileWriter;
import org.apache.paimon.predicate.Predicate;
import org.apache.paimon.schema.SchemaManager;
import org.apache.paimon.stats.SimpleStatsConverter;
import org.apache.paimon.types.RowType;
@@ -197,14 +198,18 @@ public boolean isCacheEnabled() {
}

public ManifestFile create() {
return create(null);
}

public ManifestFile create(List<Predicate> filters) {
RowType entryType = VersionedObjectSerializer.versionType(ManifestEntry.SCHEMA);
return new ManifestFile(
fileIO,
schemaManager,
partitionType,
-                new ManifestEntrySerializer(),
+                ManifestEntrySerializer.getInstance(),
entryType,
-                fileFormat.createReaderFactory(entryType),
+                fileFormat.createReaderFactory(entryType, filters),
Aitozi (Contributor), Nov 28, 2024:

If we enable the reader filter and the manifest cache, will we miss data from other buckets when reading data from bucket-x? Previously, data was stored in ObjectCache after passing through the loadFilter, but now it must pass through this filter first.

ranxianglei (Contributor Author):

If the ObjectCache is enabled and the withBuckets pushdown is used, the problem you describe can indeed occur. I originally planned to add a filter condition to ObjectCache, but the change was too complicated and I did not have the time, so for now I only pushed withBuckets down. In most scenarios there is no problem: as far as I can see, Flink and Spark never call withBuckets at all, and in an OLAP query that reads the corresponding buckets by segment, the bucket-to-segment mapping is stable, so no problem arises either.
If not for this consideration, I would suggest pushing the partition down as well.
If you feel the risk is too great, you can even turn off the manifest's metadata cache; performance still improves significantly. @Aitozi

Aitozi (Contributor):

Thanks for your explanation. If we cannot handle the pushdown when the cache is enabled, I think we can disable the filter pushdown in that case.
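The hazard under discussion, as a self-contained sketch built on the new ObjectCacheManager (manifest entries are modeled as plain bucket ids; all names are illustrative):

```java
import org.apache.paimon.fs.ObjectCacheManager;

import java.time.Duration;
import java.util.Arrays;
import java.util.List;
import java.util.stream.Collectors;

public class CachePushdownHazardSketch {
    public static void main(String[] args) {
        List<Integer> allEntries = Arrays.asList(0, 1, 2, 3); // one entry per bucket
        ObjectCacheManager<String, List<Integer>> cache =
                ObjectCacheManager.newObjectCacheManager(Duration.ofMinutes(1), 100);

        // Scan A reads bucket 0 with the filter pushed down; the *filtered*
        // result is cached under the manifest file name.
        List<Integer> scanA = cache.get("manifest-0",
                k -> allEntries.stream().filter(e -> e == 0).collect(Collectors.toList()));

        // Scan B wants bucket 1 but hits the same cache key and silently
        // misses its data.
        List<Integer> scanB = cache.get("manifest-0",
                k -> allEntries.stream().filter(e -> e == 1).collect(Collectors.toList()));

        System.out.println(scanA + " " + scanB); // [0] [0]
    }
}
```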

ranxianglei (Contributor Author):

Between metadata caching and manifest pushdown, I recommend choosing the latter. The performance of Paimon's ObjectCache implementation is very low; in testing it is sometimes not even as fast as manifest pushdown. I will submit a PR later to fix the ObjectCache performance problem. @Aitozi

JingsongLi (Contributor):

@Aitozi This is a scenario quite different from mainstream applications in the community. The author's internal analysis engine has no central planning node; each computing node plans for itself and only cares about its own bucket.

Actually, this is more like a manifest cache in the writer node than the current design.

Aitozi (Contributor), Dec 2, 2024:

@JingsongLi In the writer node, it may still need to read more than one bucket's entries from the manifest if the parallelism is lower than the bucket number.

JingsongLi (Contributor):

@Aitozi It is true; there are problems in this PR's implementation.

ranxianglei (Contributor Author):

[screenshot]
Great! It read more than 2 GB of metadata at one time.

fileFormat.createWriterFactory(entryType),
compression,
pathFactory.manifestFileFactory(),
@@ -35,9 +35,13 @@
import org.apache.paimon.operation.metrics.ScanStats;
import org.apache.paimon.partition.PartitionPredicate;
import org.apache.paimon.predicate.Predicate;
import org.apache.paimon.predicate.PredicateBuilder;
import org.apache.paimon.schema.SchemaManager;
import org.apache.paimon.schema.TableSchema;
import org.apache.paimon.table.source.ScanMode;
import org.apache.paimon.types.DataType;
import org.apache.paimon.types.IntType;
import org.apache.paimon.types.RowType;
import org.apache.paimon.utils.BiFilter;
import org.apache.paimon.utils.Filter;
import org.apache.paimon.utils.Pair;
@@ -81,6 +85,7 @@ public abstract class AbstractFileStoreScan implements FileStoreScan {

private Snapshot specifiedSnapshot = null;
private Filter<Integer> bucketFilter = null;
private Collection<Integer> buckets;
private BiFilter<Integer, Integer> totalAwareBucketFilter = null;
private List<ManifestFileMeta> specifiedManifests = null;
protected ScanMode scanMode = ScanMode.ALL;
@@ -128,6 +133,14 @@ public FileStoreScan withPartitionFilter(PartitionPredicate predicate) {
@Override
public FileStoreScan withBucket(int bucket) {
this.bucketFilter = i -> i == bucket;
this.buckets = Collections.singletonList(bucket);
return this;
}

@Override
public FileStoreScan withBuckets(Collection<Integer> buckets) {
this.bucketFilter = buckets::contains;
this.buckets = buckets;
return this;
}

@@ -379,7 +392,7 @@ protected TableSchema scanTableSchema(long id) {
public List<ManifestEntry> readManifest(ManifestFileMeta manifest) {
List<ManifestEntry> entries =
manifestFileFactory
-                        .create()
+                        .create(createPushDownFilter(buckets))
.read(
manifest.fileName(),
manifest.fileSize(),
@@ -426,6 +439,22 @@ private Filter<InternalRow> createCacheRowFilter() {
return row -> manifestCacheFilter.test(partitionGetter.apply(row), bucketGetter.apply(row));
}

    /**
     * Build a predicate on the bucket column from the currently required buckets, so that it
     * can be pushed down into the file format reader.
     */
private static List<Predicate> createPushDownFilter(Collection<Integer> buckets) {
Aitozi (Contributor):

Does the query performance mainly gain from the bucket field push down for the ORC manifest file?

ranxianglei (Contributor Author):

More than half of the performance improvement comes from the ORC pushdown of the manifest; another part comes from optimizing OrcFileFormat creation, and the rest from caching some time-consuming object operations in Scan. @Aitozi

ranxianglei (Contributor Author):

Together with #4231, bucket data also benefits from ORC pushdown. For tests see issue #4586; the current ORC implementation is more than 10 times faster than Parquet! @Aitozi

if (buckets == null || buckets.isEmpty()) {
return null;
}
List<Predicate> predicates = new ArrayList<>();
PredicateBuilder predicateBuilder =
new PredicateBuilder(
RowType.of(new DataType[] {new IntType()}, new String[] {"_BUCKET"}));
predicates.add(predicateBuilder.in(0, new ArrayList<>(buckets)));
return predicates;
}
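A standalone sketch of the predicate this method builds (the `PredicateBuilder#in` signature is assumed from the call above; `_BUCKET` is the bucket column of the manifest entry row type):

```java
import org.apache.paimon.predicate.Predicate;
import org.apache.paimon.predicate.PredicateBuilder;
import org.apache.paimon.types.DataType;
import org.apache.paimon.types.IntType;
import org.apache.paimon.types.RowType;

import java.util.ArrayList;
import java.util.Arrays;

public class BucketPushdownSketch {
    public static void main(String[] args) {
        // Same construction as createPushDownFilter: an IN predicate over the
        // single `_BUCKET` column of the manifest entry row type.
        PredicateBuilder builder =
                new PredicateBuilder(
                        RowType.of(new DataType[] {new IntType()}, new String[] {"_BUCKET"}));
        Predicate bucketIn = builder.in(0, new ArrayList<>(Arrays.asList(0, 2)));
        // Passed to ManifestFile.Factory#create(List<Predicate>), this predicate
        // can become an ORC SearchArgument, letting the reader skip row groups
        // that contain only other buckets.
        System.out.println(bucketIn);
    }
}
```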

/**
* Read the corresponding entries based on the current required partition and bucket.
*
@@ -38,6 +38,7 @@
import javax.annotation.Nullable;

import java.util.ArrayList;
import java.util.Collection;
import java.util.Iterator;
import java.util.LinkedHashMap;
import java.util.List;
@@ -57,6 +58,8 @@ public interface FileStoreScan {

FileStoreScan withBucket(int bucket);

FileStoreScan withBuckets(Collection<Integer> buckets);

FileStoreScan withBucketFilter(Filter<Integer> bucketFilter);

FileStoreScan withTotalAwareBucketFilter(BiFilter<Integer, Integer> bucketFilter);
@@ -34,6 +34,7 @@
import org.apache.paimon.table.source.DataSplit;
import org.apache.paimon.table.source.DataTableScan;
import org.apache.paimon.table.source.InnerTableRead;
import org.apache.paimon.table.source.InnerTableScan;
import org.apache.paimon.table.source.Split;
import org.apache.paimon.table.source.TableRead;
import org.apache.paimon.table.source.TableScan;
@@ -44,6 +45,7 @@

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
@@ -266,6 +268,13 @@ public Scan withBucketFilter(Filter<Integer> bucketFilter) {
return this;
}

@Override
public InnerTableScan withBuckets(Collection<Integer> buckets) {
mainScan.withBuckets(buckets);
fallbackScan.withBuckets(buckets);
return this;
}

@Override
public Scan withLevelFilter(Filter<Integer> levelFilter) {
mainScan.withLevelFilter(levelFilter);
@@ -48,6 +48,7 @@
import org.apache.paimon.utils.Pair;
import org.apache.paimon.utils.SnapshotManager;

import java.util.Collection;
import java.util.List;
import java.util.Map;
import java.util.Optional;
@@ -79,6 +80,12 @@ public AbstractDataTableScan withBucketFilter(Filter<Integer> bucketFilter) {
return this;
}

@Override
public AbstractDataTableScan withBuckets(Collection<Integer> buckets) {
snapshotReader.withBuckets(buckets);
return this;
}

@Override
public AbstractDataTableScan withPartitionFilter(Map<String, String> partitionSpec) {
snapshotReader.withPartitionFilter(partitionSpec);
@@ -23,6 +23,8 @@
import org.apache.paimon.predicate.Predicate;
import org.apache.paimon.utils.Filter;

import java.util.Collection;
import java.util.Collections;
import java.util.List;
import java.util.Map;

@@ -47,6 +49,15 @@ default InnerTableScan withBucketFilter(Filter<Integer> bucketFilter) {
return this;
}

default InnerTableScan withBucket(Integer bucket) {
return withBuckets(Collections.singletonList(bucket));
}

default InnerTableScan withBuckets(Collection<Integer> buckets) {
// Returning `this` is not a safe default, because too many classes implement neither this method nor withBucketFilter.
return this;
}

default InnerTableScan withLevelFilter(Filter<Integer> levelFilter) {
return this;
}
@@ -39,6 +39,7 @@

import javax.annotation.Nullable;

import java.util.Collection;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
@@ -81,6 +82,8 @@ public interface SnapshotReader {

SnapshotReader withBucket(int bucket);

SnapshotReader withBuckets(Collection<Integer> buckets);

SnapshotReader withBucketFilter(Filter<Integer> bucketFilter);

SnapshotReader withDataFileNameFilter(Filter<String> fileNameFilter);
@@ -53,6 +53,7 @@
import javax.annotation.Nullable;

import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
@@ -246,6 +247,11 @@ public SnapshotReader withBucket(int bucket) {
return this;
}

public SnapshotReader withBuckets(Collection<Integer> buckets) {
scan.withBuckets(buckets);
return this;
}

@Override
public SnapshotReader withBucketFilter(Filter<Integer> bucketFilter) {
scan.withBucketFilter(bucketFilter);
@@ -272,7 +278,13 @@ public SnapshotReader withShard(int indexOfThisSubtask, int numberOfParallelSubt
Math.abs(file.hashCode() % numberOfParallelSubtasks)
== indexOfThisSubtask);
} else {
-            withBucketFilter(bucket -> bucket % numberOfParallelSubtasks == indexOfThisSubtask);
+            Set<Integer> buckets = new HashSet<>();
+            for (int bucket = 0; bucket < this.tableSchema.numBuckets(); bucket++) {
+                if (bucket % numberOfParallelSubtasks == indexOfThisSubtask) {
+                    buckets.add(bucket);
+                }
+            }
+            withBuckets(buckets);
}
return this;
}
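A worked example of the new shard-to-bucket assignment (bucket count and parallelism are illustrative):

```java
import java.util.HashSet;
import java.util.Set;

public class ShardAssignmentSketch {
    public static void main(String[] args) {
        int numBuckets = 5; // stands in for tableSchema.numBuckets()
        int numberOfParallelSubtasks = 2;
        for (int subtask = 0; subtask < numberOfParallelSubtasks; subtask++) {
            Set<Integer> buckets = new HashSet<>();
            for (int bucket = 0; bucket < numBuckets; bucket++) {
                if (bucket % numberOfParallelSubtasks == subtask) {
                    buckets.add(bucket);
                }
            }
            // subtask 0 -> [0, 2, 4], subtask 1 -> [1, 3]; an explicit set can
            // now feed withBuckets(...) and be pushed down, unlike the previous
            // opaque bucket -> boolean filter.
            System.out.println("subtask " + subtask + " -> " + buckets);
        }
    }
}
```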