
Commit 12d9633

Merge pull request #436 from Azure/dev
Dev
2 parents: c22bdf0 + e463b46

File tree

8 files changed: 654 additions & 19 deletions

ChangeLog.txt

Lines changed: 3 additions & 0 deletions
@@ -1,3 +1,6 @@
+2019.02.15 Version 10.5.0
+* Added uploadFromNonReplayableFlowable to support uploading arbitrary data sources (like network streams) to a block blob.
+
 2019.01.11 Version 10.4.0
 * Fixed a bug that caused errors when java.io.tempdir has no trailing separator.
 * Upgrade autorest-clientruntime dependency to include some bug fixes.

README.md

Lines changed: 1 addition & 1 deletion
@@ -37,7 +37,7 @@ To get the binaries of this library as distributed by Microsoft, ready for use w
 <dependency>
     <groupId>com.microsoft.azure</groupId>
     <artifactId>azure-storage-blob</artifactId>
-    <version>10.4.0</version>
+    <version>10.5.0</version>
 </dependency>
 ```

pom.xml

Lines changed: 1 addition & 1 deletion
@@ -14,7 +14,7 @@
 
     <groupId>com.microsoft.azure</groupId>
     <artifactId>azure-storage-blob</artifactId>
-    <version>10.4.0</version>
+    <version>10.5.0</version>
 
     <name>Azure Storage Blob</name>
     <description>The Azure Storage Java Blob library.</description>

src/main/java/com/microsoft/azure/storage/blob/Constants.java

Lines changed: 1 addition & 1 deletion
@@ -184,7 +184,7 @@ static final class HeaderConstants {
         /**
          * Specifies the value to use for UserAgent header.
          */
-        static final String USER_AGENT_VERSION = "10.4.0";
+        static final String USER_AGENT_VERSION = "10.5.0";
 
         private HeaderConstants() {
             // Private to prevent construction.

src/main/java/com/microsoft/azure/storage/blob/TransferManager.java

Lines changed: 132 additions & 0 deletions
@@ -16,6 +16,7 @@
 package com.microsoft.azure.storage.blob;
 
 import com.microsoft.azure.storage.blob.models.BlobDownloadHeaders;
+import com.microsoft.azure.storage.blob.models.BlockBlobCommitBlockListResponse;
 import com.microsoft.azure.storage.blob.models.ModifiedAccessConditions;
 import com.microsoft.rest.v2.util.FlowableUtil;
 import io.reactivex.Flowable;
@@ -286,4 +287,135 @@ private static Single<List<Object>> getSetupSingle(BlobURL blobURL, BlobRange r,
             return Single.just(Arrays.asList(r.count(), o.accessConditions()));
         }
     }
+
+    /**
+     * Uploads the contents of an arbitrary {@code Flowable} to a block blob. This Flowable need not be replayable and
+     * therefore it may have as its source a network stream or any other data for which the replay behavior is unknown
+     * (non-replayable meaning the Flowable may not return the exact same data on each subscription).
+     *
+     * To eliminate the need for replayability on the source, the client must perform some buffering in order to
+     * ensure the actual data passed to the network is replayable. This is important in order to support retries,
+     * which are crucial for reliable data transfer. Typically, the greater the number of buffers used, the greater
+     * the possible parallelism. Larger buffers mean we will have to stage fewer blocks. The tradeoffs between these
+     * values are context-dependent, so some experimentation may be required to optimize inputs for a given scenario.
+     *
+     * Note that buffering must be strictly sequential. Only the upload portion of this operation may be parallelized;
+     * the reads cannot be. Therefore, this method is not as optimal as
+     * {@link #uploadFileToBlockBlob(AsynchronousFileChannel, BlockBlobURL, int, TransferManagerUploadToBlockBlobOptions)},
+     * and if the source is known to be a file, that method should be preferred.
+     *
+     * @param source
+     *         Contains the data to upload. Unlike other upload methods in this library, this method does not require
+     *         that the Flowable be replayable.
+     * @param blockBlobURL
+     *         Points to the blob to which the data should be uploaded.
+     * @param blockSize
+     *         The size of each block that will be staged. This value also determines the size of each buffer used by
+     *         this method and the number of requests that need to be made. The amount of memory consumed by this
+     *         method may be up to blockSize * numBuffers. If blockSize is large, this method will make fewer network
+     *         calls, but each individual call will send more data and will therefore take longer.
+     * @param numBuffers
+     *         The maximum number of buffers this method should allocate. Must be at least two. Generally this value
+     *         should have some relationship to the value for parallelism passed via the options. If the number of
+     *         available buffers is smaller than the level of parallelism, then this method will not be able to make
+     *         full use of the available parallelism. It is unlikely that the value need be more than two times the
+     *         level of parallelism, as such a value means that (assuming buffering is fast enough) there are enough
+     *         available buffers to have both one occupied for each worker and one ready for all workers should they
+     *         all complete the current request at approximately the same time. The amount of memory consumed by this
+     *         method may be up to blockSize * numBuffers.
+     * @param options
+     *         {@link TransferManagerUploadToBlockBlobOptions}
+     * @return Emits the successful response.
+     *
+     * @apiNote ## Sample Code \n
+     * [!code-java[Sample_Code](../azure-storage-java/src/test/java/com/microsoft/azure/storage/Samples.java?name=tm_nrf "Sample code for TransferManager.uploadFromNonReplayableFlowable")] \n
+     * For more samples, please see the [Samples file](%https://github.com/Azure/azure-storage-java/blob/master/src/test/java/com/microsoft/azure/storage/Samples.java)
+     */
+    public static Single<BlockBlobCommitBlockListResponse> uploadFromNonReplayableFlowable(
+            final Flowable<ByteBuffer> source, final BlockBlobURL blockBlobURL, final int blockSize,
+            final int numBuffers, final TransferManagerUploadToBlockBlobOptions options) {
+        Utility.assertNotNull("source", source);
+        Utility.assertNotNull("blockBlobURL", blockBlobURL);
+
+        TransferManagerUploadToBlockBlobOptions optionsReal = options == null
+                ? TransferManagerUploadToBlockBlobOptions.DEFAULT : options;
+
+        // See ProgressReporter for an explanation of why this lock is necessary and why we use AtomicLong.
+        AtomicLong totalProgress = new AtomicLong(0);
+        Lock progressLock = new ReentrantLock();
+
+        // Validation is done in the constructor.
+        UploadFromNRFBufferPool pool = new UploadFromNRFBufferPool(numBuffers, blockSize);
+
+        /*
+        Break the source Flowable into chunks that are <= block size. This makes filling the pooled buffers much
+        easier, as we can guarantee we only need at most two buffers for any call to write (two in the case of one
+        pool buffer filling up with more data left to write).
+        */
+        Flowable<ByteBuffer> chunkedSource = source.flatMap(buffer -> {
+            if (buffer.remaining() <= blockSize) {
+                return Flowable.just(buffer);
+            }
+            List<ByteBuffer> smallerChunks = new ArrayList<>();
+            for (int i = 0; i < Math.ceil(buffer.remaining() / (double) blockSize); i++) {
+                // Note that duplicate does not duplicate data. It simply creates a duplicate view of the data.
+                ByteBuffer duplicate = buffer.duplicate();
+                duplicate.position(i * blockSize);
+                duplicate.limit(Math.min(duplicate.limit(), (i + 1) * blockSize));
+                smallerChunks.add(duplicate);
+            }
+            return Flowable.fromIterable(smallerChunks);
+        }, false, 1);
+
+        // Write each buffer from the chunkedSource to the pool and call flush at the end to get the last bits.
+        return chunkedSource.flatMap(pool::write, false, 1)
+                .concatWith(Flowable.defer(pool::flush))
+                .concatMapEager(buffer -> {
+                    // Report progress as necessary.
+                    Flowable<ByteBuffer> data = ProgressReporter.addParallelProgressReporting(Flowable.just(buffer),
+                            optionsReal.progressReceiver(), progressLock, totalProgress);
+
+                    final String blockId = Base64.getEncoder().encodeToString(
+                            UUID.randomUUID().toString().getBytes());
+
+                    /*
+                    Make a call to stageBlock. Instead of emitting the response, which we don't care about other
+                    than that it was successful, emit the blockId for this request. These will be collected below.
+                    Turn that into an Observable which emits one item to comply with the signature of
+                    concatMapEager.
+                    */
+                    return blockBlobURL.stageBlock(blockId, data,
+                            buffer.remaining(), optionsReal.accessConditions().leaseAccessConditions(), null)
+                            .map(x -> {
+                                pool.returnBuffer(buffer);
+                                return blockId;
+                            }).toFlowable();
+
+                /*
+                Specify the number of concurrent subscribers to this map. This determines how many concurrent
+                REST calls are made. This is so because maxConcurrency is the number of internal subscribers
+                available to subscribe to the Observables emitted by the source. A subscriber is not released
+                for a new subscription until its Observable calls onComplete, which here means that the call to
+                stageBlock is finished. Prefetch is a hint that each of the Observables emitted by the source
+                will emit only one value, which is true here because we have converted from a Single.
+                */
+                }, optionsReal.parallelism(), 1)
+                /*
+                collectInto will gather each of the emitted blockIds into a list. Because we used concatMap, the Ids
+                will be emitted according to their block number, which means the list generated here will be
+                properly ordered. This also converts into a Single.
+                */
+                .collectInto(new ArrayList<String>(), ArrayList::add)
+                /*
+                collectInto will not emit the list until its source calls onComplete. This means that by the time we
+                call commitBlockList, all of the stageBlock calls will have finished. By flatMapping the list, we
+                can "map" it into a call to commitBlockList.
+                */
+                .flatMap(ids ->
+                        blockBlobURL.commitBlockList(ids, optionsReal.httpHeaders(), optionsReal.metadata(),
+                                optionsReal.accessConditions(), null));
+    }
 }
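
As a rough orientation for callers, the sketch below shows one way to invoke the new method; it is illustrative only, not part of the commit. The pipeline setup (SharedKeyCredentials, StorageURL.createPipeline, ServiceURL) follows the usual v10 SDK pattern, the account, container, and blob names are placeholders, and passing null for options falls back to TransferManagerUploadToBlockBlobOptions.DEFAULT, as the implementation above shows.

import com.microsoft.azure.storage.blob.*;
import io.reactivex.Flowable;

import java.io.InputStream;
import java.net.URL;
import java.nio.ByteBuffer;

public final class NonReplayableUploadSketch {
    public static void main(String[] args) throws Exception {
        // Placeholder credentials and endpoint; substitute real values.
        SharedKeyCredentials creds = new SharedKeyCredentials("myaccount", "<account-key>");
        BlockBlobURL blobURL = new ServiceURL(
                new URL("https://myaccount.blob.core.windows.net"),
                StorageURL.createPipeline(creds, new PipelineOptions()))
                .createContainerURL("mycontainer")
                .createBlockBlobURL("myblob");

        // A non-replayable source: each read from the stream is emitted exactly once,
        // so re-subscribing would NOT reproduce the same data.
        InputStream stream = System.in; // stands in for a network stream
        Flowable<ByteBuffer> source = Flowable.generate(emitter -> {
            byte[] chunk = new byte[8 * 1024];
            int read = stream.read(chunk);
            if (read == -1) {
                emitter.onComplete();
            } else {
                emitter.onNext(ByteBuffer.wrap(chunk, 0, read));
            }
        });

        // 4 MB blocks and 4 buffers bound memory use at roughly 16 MB (blockSize * numBuffers);
        // null options falls back to TransferManagerUploadToBlockBlobOptions.DEFAULT.
        TransferManager.uploadFromNonReplayableFlowable(source, blobURL, 4 * 1024 * 1024, 4, null)
                .blockingGet();
    }
}

Note that the Flowable built with Flowable.generate emits each chunk exactly once, which is precisely the non-replayable case this method exists to handle.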
src/main/java/com/microsoft/azure/storage/blob/UploadFromNRFBufferPool.java

Lines changed: 174 additions & 0 deletions

@@ -0,0 +1,174 @@
+/*
+ * Copyright Microsoft Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package com.microsoft.azure.storage.blob;
+
+import io.reactivex.Flowable;
+
+import java.nio.ByteBuffer;
+import java.util.concurrent.BlockingQueue;
+import java.util.concurrent.LinkedBlockingQueue;
+import java.util.concurrent.locks.Lock;
+import java.util.concurrent.locks.ReentrantLock;
+
+/**
+ * This type exists only to support the implementation of uploadFromNonReplayableFlowable. It is mandatory that the
+ * caller has broken the source into ByteBuffers that are no greater than the size of a chunk, and therefore of a
+ * buffer in the pool. This is necessary because it upper bounds the number of buffers we need for a given call to
+ * write() to 2. If the size of the ByteBuffer passed into write() were unbounded, the pool could stall as it would
+ * run out of buffers before it is able to return a result, and if it is unable to return, no data can be uploaded
+ * and therefore no buffers returned.
+ *
+ * It is incumbent upon the caller to return the buffers after an upload is completed. It is also the caller's
+ * responsibility to signal to the pool when the stream is empty and call flush to return any data still sitting in
+ * the pool.
+ *
+ * Broadly, the workflow of this operation is to chunk the source into reasonably sized pieces. On each piece, one
+ * thread will call write on the pool. The pool will grab a buffer from the queue to write to, possibly waiting for
+ * one to be available, and either store the incomplete buffer to be filled on the next write or return the buffer to
+ * be sent. Filled buffers can be uploaded in parallel and should return buffers to the pool after the upload
+ * completes. Once the source terminates, it should call flush.
+ */
+final class UploadFromNRFBufferPool {
+
+    private final BlockingQueue<ByteBuffer> buffers;
+
+    private final int maxBuffs;
+
+    private int numBuffs = 0;
+
+    private final int buffSize;
+
+    private ByteBuffer currentBuf;
+
+    UploadFromNRFBufferPool(final int numBuffs, final int buffSize) {
+        /*
+        We require at least two buffers because it is possible that a given write will spill over into a second
+        buffer. We only need one overflow buffer because the max size of a ByteBuffer is assumed to be the same as
+        the size of a buffer in the pool.
+        */
+        Utility.assertInBounds("numBuffs", numBuffs, 2, Integer.MAX_VALUE);
+        this.maxBuffs = numBuffs;
+        buffers = new LinkedBlockingQueue<>(numBuffs);
+
+        // These buffers will be used in calls to stageBlock, so they must be no greater than block size.
+        Utility.assertInBounds("buffSize", buffSize, 1, BlockBlobURL.MAX_STAGE_BLOCK_BYTES);
+        this.buffSize = buffSize;
+
+        // We prep the queue with two buffers in case there is overflow.
+        buffers.add(ByteBuffer.allocate(this.buffSize));
+        buffers.add(ByteBuffer.allocate(this.buffSize));
+        this.numBuffs = 2;
+    }
+
+    public Flowable<ByteBuffer> write(ByteBuffer buf) {
+        // Check if there's a buffer holding any data from a previous call to write. If not, get a new one.
+        if (this.currentBuf == null) {
+            this.currentBuf = this.getBuffer();
+        }
+
+        Flowable<ByteBuffer> result;
+        // We can fit this whole write in the buffer we currently have.
+        if (this.currentBuf.remaining() >= buf.remaining()) {
+            this.currentBuf.put(buf);
+            if (this.currentBuf.remaining() == 0) {
+                // Reset the position so that we can read the whole thing, then return this buffer.
+                this.currentBuf.position(0);
+                result = Flowable.just(this.currentBuf);
+                // This will force us to get a new buffer next time we try to write.
+                this.currentBuf = null;
+            } else {
+                /*
+                We are still filling the current buffer, so we have no data to return. We will return the buffer
+                once it is filled.
+                */
+                result = Flowable.empty();
+            }
+        }
+        // We will overflow the current buffer and require another one.
+        else {
+            // Adjust the window of buf so that we fill up currentBuf without going out of bounds.
+            int oldLimit = buf.limit();
+            buf.limit(buf.position() + this.currentBuf.remaining());
+            this.currentBuf.put(buf);
+            // Restore the old limit so we can read to the end in the next buffer.
+            buf.limit(oldLimit);
+
+            // Reset the position so we can read the buffer.
+            this.currentBuf.position(0);
+            result = Flowable.just(this.currentBuf);
+
+            /*
+            Get a new buffer and fill it with whatever is left from buf. Note that this relies on the assumption
+            that the source Flowable has been split up into buffers that are no bigger than chunk size. This
+            assumption means we'll only have to overflow once, and the buffer we overflow into will not be filled.
+            This is the buffer we will write to on the next call to write().
+            */
+            this.currentBuf = this.getBuffer();
+            this.currentBuf.put(buf);
+        }
+        return result;
+    }
+
+    private ByteBuffer getBuffer() {
+        ByteBuffer result;
+        // There are no buffers in the queue and we have space to allocate one.
+        if (this.buffers.isEmpty() && this.numBuffs < this.maxBuffs) {
+            result = ByteBuffer.allocate(this.buffSize);
+            this.numBuffs++;
+        } else {
+            try {
+                // If empty, this will wait for an upload to finish and return a buffer.
+                result = this.buffers.take();
+            } catch (InterruptedException e) {
+                throw new IllegalStateException("UploadFromStream thread interrupted. Thread: "
+                        + Thread.currentThread().getId());
+            }
+        }
+        return result;
+    }
+
+    Flowable<ByteBuffer> flush() {
+        /*
+        Prep and return any data left in the pool. It is important to set the limit so that we don't read beyond
+        the actual data, as this buffer may have been used before and therefore may have some garbage at the end.
+        */
+        if (this.currentBuf != null) {
+            this.currentBuf.flip();
+            ByteBuffer last = this.currentBuf;
+            // If there is an accidental duplicate call to flush, this prevents sending the last buffer twice.
+            this.currentBuf = null;
+            return Flowable.just(last);
+        }
+        return Flowable.empty();
+    }
+
+    void returnBuffer(ByteBuffer b) {
+        // Reset the buffer.
+        b.position(0);
+        b.limit(b.capacity());
+
+        try {
+            this.buffers.put(b);
+        } catch (InterruptedException e) {
+            throw new IllegalStateException("UploadFromStream thread interrupted.");
+        }
+    }
+}
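
To make the pool's contract concrete, here is a small hypothetical driver, not part of the commit, that chunks 18 bytes through a two-buffer pool of 8-byte buffers and drives write, flush, and returnBuffer by hand, mirroring what uploadFromNonReplayableFlowable does. Because the class is package-private, such a driver would only compile inside com.microsoft.azure.storage.blob, and the sizing is purely illustrative.

package com.microsoft.azure.storage.blob;

import io.reactivex.Flowable;

import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;

final class UploadBufferPoolDemo {
    public static void main(String[] args) {
        // Two 8-byte buffers; every chunk written must be <= 8 bytes, per the class contract.
        UploadFromNRFBufferPool pool = new UploadFromNRFBufferPool(2, 8);

        Flowable<ByteBuffer> filled = Flowable
                .just("hello, ", "world!!!", "bye")    // 7 + 8 + 3 = 18 bytes total
                .map(s -> ByteBuffer.wrap(s.getBytes(StandardCharsets.US_ASCII)))
                .flatMap(pool::write, false, 1)        // emits a buffer only once it is full
                .concatWith(Flowable.defer(pool::flush)); // emits the final, partly filled buffer

        // Each emitted buffer would normally be staged as a block and then returned to the pool:
        filled.blockingSubscribe(buf -> {
            System.out.println(buf.remaining() + " bytes ready to stage");
            pool.returnBuffer(buf);
        });
        // Prints "8 bytes ready to stage" twice, then "2 bytes ready to stage".
    }
}

The three emissions (8, 8, and 2 bytes) exercise both behaviors described above: a write that overflows into a second buffer, and a flush that flips and returns the final partial buffer.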
