diff --git a/doc/release_notes.md b/doc/release_notes.md
index 1cee78cb5..a3f26c16e 100644
--- a/doc/release_notes.md
+++ b/doc/release_notes.md
@@ -261,7 +261,10 @@ OpenPose Library - Release Notes
 ## Current version - future OpenPose 1.5.0
 1. Main improvements:
     1. Added initial single-person tracker for further speed up or visual smoothing (`--tracking` flag).
-    2. Greedy body part connector implemented in CUDA: +~30% speed up in Nvidia (CUDA) version with default flags and +~10% in maximum accuracy configuration. In addition, it provides a small 0.5% boost in accuracy (default flags).
+    2. Speed up of the CUDA functions of OpenPose:
+        1. Greedy body part connector implemented in CUDA: +~30% speedup in Nvidia (CUDA) version with default flags and +~10% in maximum accuracy configuration. In addition, it provides a small 0.5% boost in accuracy (default flags).
+        2. +5-30% additional speedup for the body part connector of point 1.
+        3. 2-4x speedup for NMS.
     3. Unity binding of OpenPose released. OpenPose adds the flag `BUILD_UNITY_SUPPORT` on CMake, which enables special Unity code so it can be built as a Unity plugin.
     4. If camera is unplugged, OpenPose GUI and command line will display a warning and try to reconnect it.
     5. Wrapper classes simplified and renamed. Wrapper renamed as WrapperT, and created Wrapper as the non-templated class equivalent.
diff --git a/doc/speed_up_openpose.md b/doc/speed_up_openpose.md
index de86606d0..fe7c7d571 100644
--- a/doc/speed_up_openpose.md
+++ b/doc/speed_up_openpose.md
@@ -39,6 +39,7 @@ Some speed tips to maximize the OpenPose runtime speed while preserving the accu
     2. Change GPU rendering by CPU rendering to get approximately +0.5 FPS (`--render_pose 1`).
     3. Use cuDNN 5.1 or 7.2 (cuDNN 6 is ~10% slower).
     4. Use the `BODY_25` model for simultaneously maximum speed and accuracy (both COCO and MPII models are slower and less accurate). But it does increase the GPU memory, so it might go out of memory more easily in low-memory GPUs.
+    5. Enable the AVX flag in CMake-GUI (if your computer supports it).
 
 
 
diff --git a/include/openpose/utilities/profiler.hpp b/include/openpose/utilities/profiler.hpp
index c44ae90ac..4818f7330 100644
--- a/include/openpose/utilities/profiler.hpp
+++ b/include/openpose/utilities/profiler.hpp
@@ -60,6 +60,7 @@ namespace op
         } \
         cudaDeviceSynchronize(); \
         (finalTime) = (factor)/(float)(REPS)*getTimeSeconds(timerInit); \
+        cudaCheck(__LINE__, __FUNCTION__, __FILE__); \
     }
 
     // Enable PROFILER_ENABLED on Makefile.config or CMake in order to use this function. Otherwise nothing will be outputted.
diff --git a/src/openpose/net/bodyPartConnectorBase.cpp b/src/openpose/net/bodyPartConnectorBase.cpp
index 71ed89f26..b6a570c27 100644
--- a/src/openpose/net/bodyPartConnectorBase.cpp
+++ b/src/openpose/net/bodyPartConnectorBase.cpp
@@ -1,3 +1,4 @@
+#include <set>
 #include <openpose/utilities/check.hpp>
 #include <openpose/utilities/fastMath.hpp>
 #include <openpose/pose/poseParameters.hpp>
@@ -459,6 +460,7 @@ namespace op
             const auto peaksOffset = (maxPeaks+1);
             // Save which body parts have been already assigned
             std::vector<int> personAssigned(numberBodyParts*maxPeaks, -1);
+            std::set<int, std::greater<int>> indexesToRemoveSortedSet;
             // Iterate over each PAF pair connection detected
             // E.g., neck1-nose2, neck5-Lshoulder0, etc.
             for (const auto& pairConnection : pairConnections)
@@ -592,18 +594,23 @@ namespace op
                         // Update score
                         peopleVector[assigned1].second += peopleVector[assigned2].second + pafScore;
                         // Erase the non-merged person
-                        peopleVector.erase(peopleVector.begin()+assigned2);
+                        // peopleVector.erase(peopleVector.begin()+assigned2); // x2 slower when removing on-the-fly
+                        indexesToRemoveSortedSet.emplace(assigned2); // Add into set so we can remove them all at once
                         // Update associated personAssigned (person indexes have changed)
                         for (auto& element : personAssigned)
                         {
                             if (element == assigned2)
                                 element = assigned1;
-                            else if (element > assigned2)
-                                element--;
+                            // No need because I will only remove them at the very end
+                            // else if (element > assigned2)
+                            //     element--;
                         }
                     }
                 }
             }
+            // Remove unused people
+            for (const auto& index : indexesToRemoveSortedSet)
+                peopleVector.erase(peopleVector.begin()+index);
             // Return result
             return peopleVector;
         }
@@ -685,7 +692,7 @@ namespace op
                 poseKeypoints.reset();
                 poseScores.reset();
             }
-            const auto numberBodyPartsAndPAFs = numberBodyParts + numberBodyPartPairs;
+            const auto oneOverNumberBodyPartsAndPAFs = 1/T(numberBodyParts + numberBodyPartPairs);
             for (auto person = 0u ; person < validSubsetIndexes.size() ; person++)
             {
                 const auto& personPair = peopleVector[validSubsetIndexes[person]];
@@ -701,7 +708,7 @@ namespace op
                         poseKeypoints[baseOffset + 2] = peaksPtr[bodyPartIndex];
                     }
                 }
-                poseScores[person] = personPair.second / T(numberBodyPartsAndPAFs);
+                poseScores[person] = personPair.second * oneOverNumberBodyPartsAndPAFs;
             }
         }
         catch (const std::exception& e)
diff --git a/src/openpose/net/bodyPartConnectorBase.cu b/src/openpose/net/bodyPartConnectorBase.cu
index 025cac553..4e3d172b8 100644
--- a/src/openpose/net/bodyPartConnectorBase.cu
+++ b/src/openpose/net/bodyPartConnectorBase.cu
@@ -5,8 +5,6 @@
 
 namespace op
 {
-    const dim3 THREADS_PER_BLOCK{4, 16, 16};
-
     template<typename T>
     inline __device__ int intRoundGPU(const T a)
     {
@@ -14,9 +12,9 @@ namespace op
     }
 
     template <typename T>
-    inline __device__  T process(const T* bodyPartA, const T* bodyPartB, const T* mapX, const T* mapY,
-                                 const int heatmapWidth, const int heatmapHeight, const T interThreshold,
-                                 const T interMinAboveThreshold)
+    inline __device__  T process(
+        const T* bodyPartA, const T* bodyPartB, const T* mapX, const T* mapY, const int heatmapWidth,
+        const int heatmapHeight, const T interThreshold, const T interMinAboveThreshold)
     {
         const auto vectorAToBX = bodyPartB[0] - bodyPartA[0];
         const auto vectorAToBY = bodyPartB[1] - bodyPartA[1];
@@ -67,18 +65,56 @@ namespace op
         return -1;
     }
 
+    // template <typename T>
+    // __global__ void pafScoreKernelOld(
+    //     T* pairScoresPtr, const T* const heatMapPtr, const T* const peaksPtr, const unsigned int* const bodyPartPairsPtr,
+    //     const unsigned int* const mapIdxPtr, const unsigned int maxPeaks, const int numberBodyPartPairs,
+    //     const int heatmapWidth, const int heatmapHeight, const T interThreshold, const T interMinAboveThreshold)
+    // {
+    //     const auto pairIndex = (blockIdx.x * blockDim.x) + threadIdx.x;
+    //     const auto peakA = (blockIdx.y * blockDim.y) + threadIdx.y;
+    //     const auto peakB = (blockIdx.z * blockDim.z) + threadIdx.z;
+
+    //     if (pairIndex < numberBodyPartPairs && peakA < maxPeaks && peakB < maxPeaks)
+    //     {
+    //         const auto baseIndex = 2*pairIndex;
+    //         const auto partA = bodyPartPairsPtr[baseIndex];
+    //         const auto partB = bodyPartPairsPtr[baseIndex + 1];
+
+    //         const T numberPeaksA = peaksPtr[3*partA*(maxPeaks+1)];
+    //         const T numberPeaksB = peaksPtr[3*partB*(maxPeaks+1)];
+
+    //         const auto outputIndex = (pairIndex*maxPeaks+peakA)*maxPeaks + peakB;
+    //         if (peakA < numberPeaksA && peakB < numberPeaksB)
+    //         {
+    //             const auto mapIdxX = mapIdxPtr[baseIndex];
+    //             const auto mapIdxY = mapIdxPtr[baseIndex + 1];
+
+    //             const T* const bodyPartA = peaksPtr + (3*(partA*(maxPeaks+1) + peakA+1));
+    //             const T* const bodyPartB = peaksPtr + (3*(partB*(maxPeaks+1) + peakB+1));
+    //             const T* const mapX = heatMapPtr + mapIdxX*heatmapWidth*heatmapHeight;
+    //             const T* const mapY = heatMapPtr + mapIdxY*heatmapWidth*heatmapHeight;
+    //             pairScoresPtr[outputIndex] = process(
+    //                 bodyPartA, bodyPartB, mapX, mapY, heatmapWidth, heatmapHeight, interThreshold,
+    //                 interMinAboveThreshold);
+    //         }
+    //         else
+    //             pairScoresPtr[outputIndex] = -1;
+    //     }
+    // }
+
     template <typename T>
-    __global__ void pafScoreKernel(T* pairScoresPtr, const T* const heatMapPtr, const T* const peaksPtr,
-                                   const unsigned int* const bodyPartPairsPtr, const unsigned int* const mapIdxPtr,
-                                   const unsigned int maxPeaks, const int numberBodyPartPairs,
-                                   const int heatmapWidth, const int heatmapHeight, const T interThreshold,
-                                   const T interMinAboveThreshold)
+    __global__ void pafScoreKernel(
+        T* pairScoresPtr, const T* const heatMapPtr, const T* const peaksPtr, const unsigned int* const bodyPartPairsPtr,
+        const unsigned int* const mapIdxPtr, const unsigned int maxPeaks, const int numberBodyPartPairs,
+        const int heatmapWidth, const int heatmapHeight, const T interThreshold, const T interMinAboveThreshold)
     {
-        const auto pairIndex = (blockIdx.x * blockDim.x) + threadIdx.x;
+        const auto peakB = (blockIdx.x * blockDim.x) + threadIdx.x;
         const auto peakA = (blockIdx.y * blockDim.y) + threadIdx.y;
-        const auto peakB = (blockIdx.z * blockDim.z) + threadIdx.z;
+        const auto pairIndex = (blockIdx.z * blockDim.z) + threadIdx.z;
 
-        if (pairIndex < numberBodyPartPairs && peakA < maxPeaks && peakB < maxPeaks)
+        if (peakA < maxPeaks && peakB < maxPeaks)
+        // if (pairIndex < numberBodyPartPairs && peakA < maxPeaks && peakB < maxPeaks)
         {
             const auto baseIndex = 2*pairIndex;
             const auto partA = bodyPartPairsPtr[baseIndex];
@@ -106,6 +142,176 @@ namespace op
         }
     }
 
+    // template <typename T>
+    // std::vector<std::pair<std::vector<int>, T>> pafVectorIntoPeopleVectorOld(
+    //     const std::vector<std::tuple<T, T, int, int, int>>& pairConnections, const T* const peaksPtr,
+    //     const int maxPeaks, const std::vector<unsigned int>& bodyPartPairs, const unsigned int numberBodyParts)
+    // {
+    //     try
+    //     {
+    //         // std::vector<std::pair<std::vector<int>, double>> refers to:
+    //         //     - std::vector<int>: [body parts locations, #body parts found]
+    //         //     - double: person subset score
+    //         std::vector<std::pair<std::vector<int>, T>> peopleVector;
+    //         const auto vectorSize = numberBodyParts+1;
+    //         const auto peaksOffset = (maxPeaks+1);
+    //         // Save which body parts have been already assigned
+    //         std::vector<int> personAssigned(numberBodyParts*maxPeaks, -1);
+    //         // Iterate over each PAF pair connection detected
+    //         // E.g., neck1-nose2, neck5-Lshoulder0, etc.
+    //         for (const auto& pairConnection : pairConnections)
+    //         {
+    //             // Read pairConnection
+    //             // // Total score - only required for previous sort
+    //             // const auto totalScore = std::get<0>(pairConnection);
+    //             const auto pafScore = std::get<1>(pairConnection);
+    //             const auto pairIndex = std::get<2>(pairConnection);
+    //             const auto indexA = std::get<3>(pairConnection);
+    //             const auto indexB = std::get<4>(pairConnection);
+    //             // Derived data
+    //             const auto bodyPartA = bodyPartPairs[2*pairIndex];
+    //             const auto bodyPartB = bodyPartPairs[2*pairIndex+1];
+
+    //             const auto indexScoreA = (bodyPartA*peaksOffset + indexA)*3 + 2;
+    //             const auto indexScoreB = (bodyPartB*peaksOffset + indexB)*3 + 2;
+    //             // -1 because indexA and indexB are 1-based
+    //             auto& aAssigned = personAssigned[bodyPartA*maxPeaks+indexA-1];
+    //             auto& bAssigned = personAssigned[bodyPartB*maxPeaks+indexB-1];
+    //             // Debugging
+    //             #ifdef DEBUG
+    //                 if (indexA-1 > peaksOffset || indexA <= 0)
+    //                     error("Something is wrong: " + std::to_string(indexA)
+    //                           + " vs. " + std::to_string(peaksOffset) + ". Contact us.",
+    //                           __LINE__, __FUNCTION__, __FILE__);
+    //                 if (indexB-1 > peaksOffset || indexB <= 0)
+    //                     error("Something is wrong: " + std::to_string(indexB)
+    //                           + " vs. " + std::to_string(peaksOffset) + ". Contact us.",
+    //                           __LINE__, __FUNCTION__, __FILE__);
+    //             #endif
+
+    //             // Different cases:
+    //             //     1. A & B not assigned yet: Create new person
+    //             //     2. A assigned but not B: Add B to person with A (if no another B there)
+    //             //     3. B assigned but not A: Add A to person with B (if no another A there)
+    //             //     4. A & B already assigned to same person (circular/redundant PAF): Update person score
+    //             //     5. A & B already assigned to different people: Merge people if keypoint intersection is null
+    //             // 1. A & B not assigned yet: Create new person
+    //             if (aAssigned < 0 && bAssigned < 0)
+    //             {
+    //                 // Keypoint indexes
+    //                 std::vector<int> rowVector(vectorSize, 0);
+    //                 rowVector[bodyPartA] = indexScoreA;
+    //                 rowVector[bodyPartB] = indexScoreB;
+    //                 // Number keypoints
+    //                 rowVector.back() = 2;
+    //                 // Score
+    //                 const auto personScore = peaksPtr[indexScoreA] + peaksPtr[indexScoreB] + pafScore;
+    //                 // Set associated personAssigned as assigned
+    //                 aAssigned = (int)peopleVector.size();
+    //                 bAssigned = aAssigned;
+    //                 // Create new personVector
+    //                 peopleVector.emplace_back(std::make_pair(rowVector, personScore));
+    //             }
+    //             // 2. A assigned but not B: Add B to person with A (if no another B there)
+    //             // or
+    //             // 3. B assigned but not A: Add A to person with B (if no another A there)
+    //             else if ((aAssigned >= 0 && bAssigned < 0)
+    //                 || (aAssigned < 0 && bAssigned >= 0))
+    //             {
+    //                 // Assign person1 to one where xAssigned >= 0
+    //                 const auto assigned1 = (aAssigned >= 0 ? aAssigned : bAssigned);
+    //                 auto& assigned2 = (aAssigned >= 0 ? bAssigned : aAssigned);
+    //                 const auto bodyPart2 = (aAssigned >= 0 ? bodyPartB : bodyPartA);
+    //                 const auto indexScore2 = (aAssigned >= 0 ? indexScoreB : indexScoreA);
+    //                 // Person index
+    //                 auto& personVector = peopleVector[assigned1];
+    //                 // Debugging
+    //                 #ifdef DEBUG
+    //                     const auto bodyPart1 = (aAssigned >= 0 ? bodyPartA : bodyPartB);
+    //                     const auto indexScore1 = (aAssigned >= 0 ? indexScoreA : indexScoreB);
+    //                     const auto index1 = (aAssigned >= 0 ? indexA : indexB);
+    //                     if ((unsigned int)personVector.first.at(bodyPart1) != indexScore1)
+    //                         error("Something is wrong: "
+    //                               + std::to_string((personVector.first[bodyPart1]-2)/3-bodyPart1*peaksOffset)
+    //                               + " vs. " + std::to_string((indexScore1-2)/3-bodyPart1*peaksOffset) + " vs. "
+    //                               + std::to_string(index1) + ". Contact us.",
+    //                               __LINE__, __FUNCTION__, __FILE__);
+    //                 #endif
+    //                 // If person with 1 does not have a 2 yet
+    //                 if (personVector.first[bodyPart2] == 0)
+    //                 {
+    //                     // Update keypoint indexes
+    //                     personVector.first[bodyPart2] = indexScore2;
+    //                     // Update number keypoints
+    //                     personVector.first.back()++;
+    //                     // Update score
+    //                     personVector.second += peaksPtr[indexScore2] + pafScore;
+    //                     // Set associated personAssigned as assigned
+    //                     assigned2 = assigned1;
+    //                 }
+    //                 // Otherwise, ignore this B because the previous one came from a higher PAF-confident score
+    //             }
+    //             // 4. A & B already assigned to same person (circular/redundant PAF): Update person score
+    //             else if (aAssigned >=0 && bAssigned >=0 && aAssigned == bAssigned)
+    //                 peopleVector[aAssigned].second += pafScore;
+    //             // 5. A & B already assigned to different people: Merge people if keypoint intersection is null
+    //             // I.e., that the keypoints in person A and B do not overlap
+    //             else if (aAssigned >=0 && bAssigned >=0 && aAssigned != bAssigned)
+    //             {
+    //                 // Assign person1 to the one with lowest index for 2 reasons:
+    //                 //     1. Speed up: Removing an element from std::vector is cheaper for latest elements
+    //                 //     2. Avoid harder index update: Updated elements in person1ssigned would depend on
+    //                 //        whether person1 > person2 or not: element = aAssigned - (person2 > person1 ? 1 : 0)
+    //                 const auto assigned1 = (aAssigned < bAssigned ? aAssigned : bAssigned);
+    //                 const auto assigned2 = (aAssigned < bAssigned ? bAssigned : aAssigned);
+    //                 auto& person1 = peopleVector[assigned1].first;
+    //                 const auto& person2 = peopleVector[assigned2].first;
+    //                 // Check if complementary
+    //                 // Defining found keypoint indexes in personA as kA, and analogously kB
+    //                 // Complementary if and only if kA intersection kB = empty. I.e., no common keypoints
+    //                 bool complementary = true;
+    //                 for (auto part = 0u ; part < numberBodyParts ; part++)
+    //                 {
+    //                     if (person1[part] > 0 && person2[part] > 0)
+    //                     {
+    //                         complementary = false;
+    //                         break;
+    //                     }
+    //                 }
+    //                 // If complementary, merge both people into 1
+    //                 if (complementary)
+    //                 {
+    //                     // Update keypoint indexes
+    //                     for (auto part = 0u ; part < numberBodyParts ; part++)
+    //                         if (person1[part] == 0)
+    //                             person1[part] = person2[part];
+    //                     // Update number keypoints
+    //                     person1.back() += person2.back();
+    //                     // Update score
+    //                     peopleVector[assigned1].second += peopleVector[assigned2].second + pafScore;
+    //                     // Erase the non-merged person
+    //                     peopleVector.erase(peopleVector.begin()+assigned2);
+    //                     // Update associated personAssigned (person indexes have changed)
+    //                     for (auto& element : personAssigned)
+    //                     {
+    //                         if (element == assigned2)
+    //                             element = assigned1;
+    //                         else if (element > assigned2)
+    //                             element--;
+    //                     }
+    //                 }
+    //             }
+    //         }
+    //         // Return result
+    //         return peopleVector;
+    //     }
+    //     catch (const std::exception& e)
+    //     {
+    //         error(e.what(), __LINE__, __FUNCTION__, __FILE__);
+    //         return {};
+    //     }
+    // }
+
     template <typename T>
     void connectBodyPartsGpu(Array<T>& poseKeypoints, Array<T>& poseScores, const T* const heatMapGpuPtr,
                              const T* const peaksPtr, const PoseModel poseModel, const Point<int>& heatMapSize,
@@ -130,11 +336,53 @@ namespace op
                 error("The pointers bodyPartPairsGpuPtr and mapIdxGpuPtr cannot be nullptr.",
                       __LINE__, __FUNCTION__, __FILE__);
 
+            // const auto REPS = 1000;
+            // double timeNormalize0 = 0.;
+            // double timeNormalize1 = 0.;
+            // double timeNormalize2 = 0.;
+
+            // // Old - Non-efficient code
+            // OP_CUDA_PROFILE_INIT(REPS);
+            // // Run Kernel - pairScoresGpu
+            // const dim3 THREADS_PER_BLOCK{4, 16, 16};
+            // const dim3 numBlocks{
+            //     getNumberCudaBlocks(numberBodyPartPairs, THREADS_PER_BLOCK.x),
+            //     getNumberCudaBlocks(maxPeaks, THREADS_PER_BLOCK.y),
+            //     getNumberCudaBlocks(maxPeaks, THREADS_PER_BLOCK.z)};
+            // pafScoreKernelOld<<<numBlocks, THREADS_PER_BLOCK>>>(
+            //     pairScoresGpuPtr, heatMapGpuPtr, peaksGpuPtr, bodyPartPairsGpuPtr, mapIdxGpuPtr,
+            //     maxPeaks, (int)numberBodyPartPairs, heatMapSize.x, heatMapSize.y, interThreshold,
+            //     interMinAboveThreshold);
+            // // pairScoresCpu <-- pairScoresGpu
+            // cudaMemcpy(pairScoresCpu.getPtr(), pairScoresGpuPtr, totalComputations * sizeof(T),
+            //            cudaMemcpyDeviceToHost);
+            // // Get pair connections and their scores
+            // const auto pairConnections = pafPtrIntoVector(
+            //     pairScoresCpu, peaksPtr, maxPeaks, bodyPartPairs, numberBodyPartPairs);
+            // const auto peopleVector = pafVectorIntoPeopleVectorOld(
+            //     pairConnections, peaksPtr, maxPeaks, bodyPartPairs, numberBodyParts);
+            // // Delete people below the following thresholds:
+            //     // a) minSubsetCnt: removed if less than minSubsetCnt body parts
+            //     // b) minSubsetScore: removed if global score smaller than this
+            //     // c) maxPeaks (POSE_MAX_PEOPLE): keep first maxPeaks people above thresholds
+            // int numberPeople;
+            // std::vector<int> validSubsetIndexes;
+            // validSubsetIndexes.reserve(fastMin((size_t)maxPeaks, peopleVector.size()));
+            // removePeopleBelowThresholds(validSubsetIndexes, numberPeople, peopleVector, numberBodyParts, minSubsetCnt,
+            //                             minSubsetScore, maxPeaks, maximizePositives);
+            // // Fill and return poseKeypoints
+            // peopleVectorToPeopleArray(poseKeypoints, poseScores, scaleFactor, peopleVector, validSubsetIndexes,
+            //                           peaksPtr, numberPeople, numberBodyParts, numberBodyPartPairs);
+            // OP_PROFILE_END(timeNormalize1, 1e3, REPS);
+
+            // Efficient code
+            // OP_CUDA_PROFILE_INIT(REPS);
             // Run Kernel - pairScoresGpu
+            const dim3 THREADS_PER_BLOCK{128, 1, 1};
             const dim3 numBlocks{
-                getNumberCudaBlocks(numberBodyPartPairs, THREADS_PER_BLOCK.x),
+                getNumberCudaBlocks(maxPeaks, THREADS_PER_BLOCK.x),
                 getNumberCudaBlocks(maxPeaks, THREADS_PER_BLOCK.y),
-                getNumberCudaBlocks(maxPeaks, THREADS_PER_BLOCK.z)};
+                getNumberCudaBlocks(numberBodyPartPairs, THREADS_PER_BLOCK.z)};
             pafScoreKernel<<<numBlocks, THREADS_PER_BLOCK>>>(
                 pairScoresGpuPtr, heatMapGpuPtr, peaksGpuPtr, bodyPartPairsGpuPtr, mapIdxGpuPtr,
                 maxPeaks, (int)numberBodyPartPairs, heatMapSize.x, heatMapSize.y, interThreshold,
@@ -142,16 +390,12 @@ namespace op
             // pairScoresCpu <-- pairScoresGpu
             cudaMemcpy(pairScoresCpu.getPtr(), pairScoresGpuPtr, totalComputations * sizeof(T),
                        cudaMemcpyDeviceToHost);
-
-            // New code
             // Get pair connections and their scores
             const auto pairConnections = pafPtrIntoVector(
                 pairScoresCpu, peaksPtr, maxPeaks, bodyPartPairs, numberBodyPartPairs);
             const auto peopleVector = pafVectorIntoPeopleVector(
                 pairConnections, peaksPtr, maxPeaks, bodyPartPairs, numberBodyParts);
-
-            // // Old code
-            // // Get pair connections and their scores
+            // // Old code: Get pair connections and their scores
             // // std::vector<std::pair<std::vector<int>, double>> refers to:
             // //     - std::vector<int>: [body parts locations, #body parts found]
             // //     - double: person subset score
@@ -159,7 +403,6 @@ namespace op
             // const auto peopleVector = createPeopleVector(
             //     tNullptr, peaksPtr, poseModel, heatMapSize, maxPeaks, interThreshold, interMinAboveThreshold,
             //     bodyPartPairs, numberBodyParts, numberBodyPartPairs, pairScoresCpu);
-
             // Delete people below the following thresholds:
                 // a) minSubsetCnt: removed if less than minSubsetCnt body parts
                 // b) minSubsetScore: removed if global score smaller than this
@@ -169,10 +412,14 @@ namespace op
             validSubsetIndexes.reserve(fastMin((size_t)maxPeaks, peopleVector.size()));
             removePeopleBelowThresholds(validSubsetIndexes, numberPeople, peopleVector, numberBodyParts, minSubsetCnt,
                                         minSubsetScore, maxPeaks, maximizePositives);
-
             // Fill and return poseKeypoints
             peopleVectorToPeopleArray(poseKeypoints, poseScores, scaleFactor, peopleVector, validSubsetIndexes,
                                       peaksPtr, numberPeople, numberBodyParts, numberBodyPartPairs);
+            // OP_PROFILE_END(timeNormalize2, 1e3, REPS);
+
+            // // Profiling verbose
+            // log("  BPC(ori)=" + std::to_string(timeNormalize1) + "ms");
+            // log("  BPC(new)=" + std::to_string(timeNormalize2) + "ms");
 
             // Sanity check
             cudaCheck(__LINE__, __FUNCTION__, __FILE__);
diff --git a/src/openpose/net/nmsBase.cu b/src/openpose/net/nmsBase.cu
index 9a38d5eca..d85f23a49 100644
--- a/src/openpose/net/nmsBase.cu
+++ b/src/openpose/net/nmsBase.cu
@@ -9,66 +9,193 @@ namespace op
     const auto THREADS_PER_BLOCK_1D = 16u;
     const auto THREADS_PER_BLOCK = 512u;
 
+    // template <typename T>
+    // __global__ void nmsRegisterKernelOld(
+    //     int* kernelPtr, const T* const sourcePtr, const int w, const int h, const T threshold)
+    // {
+    //     // get pixel location (x,y)
+    //     const auto x = blockIdx.x * blockDim.x + threadIdx.x;
+    //     const auto y = blockIdx.y * blockDim.y + threadIdx.y;
+    //     const auto index = y*w + x;
+
+    //     if (0 < x && x < (w-1) && 0 < y && y < (h-1))
+    //     {
+    //         const auto value = sourcePtr[index];
+    //         if (value > threshold)
+    //         {
+    //             const auto topLeft     = sourcePtr[(y-1)*w + x-1];
+    //             const auto top         = sourcePtr[(y-1)*w + x];
+    //             const auto topRight    = sourcePtr[(y-1)*w + x+1];
+    //             const auto left        = sourcePtr[    y*w + x-1];
+    //             const auto right       = sourcePtr[    y*w + x+1];
+    //             const auto bottomLeft  = sourcePtr[(y+1)*w + x-1];
+    //             const auto bottom      = sourcePtr[(y+1)*w + x];
+    //             const auto bottomRight = sourcePtr[(y+1)*w + x+1];
+
+    //             if (value > topLeft && value > top && value > topRight
+    //                 && value > left && value > right
+    //                 && value > bottomLeft && value > bottom && value > bottomRight)
+    //                 kernelPtr[index] = 1;
+    //             else
+    //                 kernelPtr[index] = 0;
+    //         }
+    //         else
+    //             kernelPtr[index] = 0;
+    //     }
+    //     else if (x == 0 || x == (w-1) || y == 0 || y == (h-1))
+    //         kernelPtr[index] = 0;
+    // }
+
+    // Note: Shared memory made this function slower, from 1.2 ms to about 2 ms.
     template <typename T>
-    __global__ void nmsRegisterKernel(int* kernelPtr, const T* const sourcePtr, const int w, const int h,
-                                      const T threshold)
+    __global__ void nmsRegisterKernel(
+        int* kernelPtr, const T* const sourcePtr, const int w, const int h, const T threshold)
     {
         // get pixel location (x,y)
         const auto x = blockIdx.x * blockDim.x + threadIdx.x;
         const auto y = blockIdx.y * blockDim.y + threadIdx.y;
+        const auto channel = blockIdx.z * blockDim.z + threadIdx.z;
+        const auto channelOffset = channel * w*h;
         const auto index = y*w + x;
 
+        auto* kernelPtrOffset = &kernelPtr[channelOffset];
+        const T* const sourcePtrOffset = &sourcePtr[channelOffset];
+
         if (0 < x && x < (w-1) && 0 < y && y < (h-1))
         {
-            const auto value = sourcePtr[index];
+            const auto value = sourcePtrOffset[index];
             if (value > threshold)
             {
-                const auto topLeft     = sourcePtr[(y-1)*w + x-1];
-                const auto top         = sourcePtr[(y-1)*w + x];
-                const auto topRight    = sourcePtr[(y-1)*w + x+1];
-                const auto left        = sourcePtr[    y*w + x-1];
-                const auto right       = sourcePtr[    y*w + x+1];
-                const auto bottomLeft  = sourcePtr[(y+1)*w + x-1];
-                const auto bottom      = sourcePtr[(y+1)*w + x];
-                const auto bottomRight = sourcePtr[(y+1)*w + x+1];
+                const auto topLeft     = sourcePtrOffset[(y-1)*w + x-1];
+                const auto top         = sourcePtrOffset[(y-1)*w + x];
+                const auto topRight    = sourcePtrOffset[(y-1)*w + x+1];
+                const auto left        = sourcePtrOffset[    y*w + x-1];
+                const auto right       = sourcePtrOffset[    y*w + x+1];
+                const auto bottomLeft  = sourcePtrOffset[(y+1)*w + x-1];
+                const auto bottom      = sourcePtrOffset[(y+1)*w + x];
+                const auto bottomRight = sourcePtrOffset[(y+1)*w + x+1];
 
                 if (value > topLeft && value > top && value > topRight
                     && value > left && value > right
                     && value > bottomLeft && value > bottom && value > bottomRight)
-                    kernelPtr[index] = 1;
+                    kernelPtrOffset[index] = 1;
                 else
-                    kernelPtr[index] = 0;
+                    kernelPtrOffset[index] = 0;
             }
             else
-                kernelPtr[index] = 0;
+                kernelPtrOffset[index] = 0;
         }
         else if (x == 0 || x == (w-1) || y == 0 || y == (h-1))
-            kernelPtr[index] = 0;
+            kernelPtrOffset[index] = 0;
     }
 
+    // template <typename T>
+    // __global__ void writeResultKernelOld(
+    //     T* output, const int length, const int* const kernelPtr, const T* const sourcePtr, const int width,
+    //     const int height, const int maxPeaks, const T offsetX, const T offsetY)
+    // {
+    //     __shared__ int local[THREADS_PER_BLOCK+1]; // one more
+    //     const auto globalIdx = blockIdx.x * blockDim.x + threadIdx.x;
+
+    //     if (globalIdx < length)
+    //     {
+    //         local[threadIdx.x] = kernelPtr[globalIdx];
+    //         //last thread in the block but not globally last, load one more
+    //         if (threadIdx.x == THREADS_PER_BLOCK - 1 && globalIdx != length - 1)
+    //             local[threadIdx.x+1] = kernelPtr[globalIdx+1];
+
+    //         __syncthreads();
+    //         // See difference, except the globally last one
+    //         if (globalIdx != length - 1)
+    //         {
+    //             // A[globalIdx] == A[globalIdx + 1] means no peak
+    //             if (local[threadIdx.x] != local[threadIdx.x + 1])
+    //             {
+    //                 const auto peakIndex = kernelPtr[globalIdx]; //0-index
+    //                 const auto peakLocX = (int)(globalIdx % width);
+    //                 const auto peakLocY = (int)(globalIdx / width);
+
+    //                 // Accurate peak location: considered neighboors
+    //                 if (peakIndex < maxPeaks) // limitation
+    //                 {
+    //                     T xAcc = 0.f;
+    //                     T yAcc = 0.f;
+    //                     T scoreAcc = 0.f;
+    //                     const auto dWidth = 3;
+    //                     const auto dHeight = 3;
+    //                     for (auto dy = -dHeight ; dy <= dHeight ; dy++)
+    //                     {
+    //                         const auto y = peakLocY + dy;
+    //                         if (0 <= y && y < height) // Default height = 368
+    //                         {
+    //                             for (auto dx = -dWidth ; dx <= dWidth ; dx++)
+    //                             {
+    //                                 const auto x = peakLocX + dx;
+    //                                 if (0 <= x && x < width) // Default width = 656
+    //                                 {
+    //                                     const auto score = sourcePtr[y * width + x];
+    //                                     if (score > 0)
+    //                                     {
+    //                                         xAcc += x*score;
+    //                                         yAcc += y*score;
+    //                                         scoreAcc += score;
+    //                                     }
+    //                                 }
+    //                             }
+    //                         }
+    //                     }
+
+    //                     // Offset to keep Matlab format (empirically higher acc)
+    //                     // Best results for 1 scale: x + 0, y + 0.5
+    //                     // +0.5 to both to keep Matlab format
+    //                     const auto outputIndex = (peakIndex + 1) * 3;
+    //                     output[outputIndex] = xAcc / scoreAcc + offsetX;
+    //                     output[outputIndex + 1] = yAcc / scoreAcc + offsetY;
+    //                     output[outputIndex + 2] = sourcePtr[peakLocY*width + peakLocX];
+    //                 }
+    //             }
+    //         }
+    //         // If index 0 --> Assign number of peaks (truncated to the maximum possible number of peaks)
+    //         else
+    //             output[0] = (kernelPtr[globalIdx] < maxPeaks ? kernelPtr[globalIdx] : maxPeaks);
+    //     }
+    // }
+
     template <typename T>
     __global__ void writeResultKernel(
         T* output, const int length, const int* const kernelPtr, const T* const sourcePtr, const int width,
-        const int height, const int maxPeaks, const T offsetX, const T offsetY)
+        const int height, const int maxPeaks, const T offsetX, const T offsetY, const int offsetTarget)
     {
         __shared__ int local[THREADS_PER_BLOCK+1]; // one more
+        __shared__ int kernel0; // Offset for kernel
         const auto globalIdx = blockIdx.x * blockDim.x + threadIdx.x;
+        const auto channel = blockIdx.y * blockDim.y + threadIdx.y;
+        const auto channelOffsetSource = channel * width*height;
+        const auto channelOffset = channel * offsetTarget;
+
+        // We need to substract the peak at pixel 0 of the current channel for all values
+        if (threadIdx.x == 0)
+            kernel0 = kernelPtr[channelOffsetSource];
+        __syncthreads();
 
         if (globalIdx < length)
         {
-            local[threadIdx.x] = kernelPtr[globalIdx];
+            auto* outputOffset = &output[channelOffset];
+            const auto* const kernelPtrOffset = &kernelPtr[channelOffsetSource];
+            const auto* const sourcePtrOffset = &sourcePtr[channelOffsetSource];
+            local[threadIdx.x] = kernelPtrOffset[globalIdx] - kernel0;
             //last thread in the block but not globally last, load one more
             if (threadIdx.x == THREADS_PER_BLOCK - 1 && globalIdx != length - 1)
-                local[threadIdx.x+1] = kernelPtr[globalIdx+1];
-
+                local[threadIdx.x+1] = kernelPtrOffset[globalIdx+1] - kernel0;
             __syncthreads();
+
             // See difference, except the globally last one
             if (globalIdx != length - 1)
             {
                 // A[globalIdx] == A[globalIdx + 1] means no peak
                 if (local[threadIdx.x] != local[threadIdx.x + 1])
                 {
-                    const auto peakIndex = kernelPtr[globalIdx]; //0-index
+                    const auto peakIndex = local[threadIdx.x]; //0-index
                     const auto peakLocX = (int)(globalIdx % width);
                     const auto peakLocY = (int)(globalIdx / width);
 
@@ -90,7 +217,7 @@ namespace op
                                     const auto x = peakLocX + dx;
                                     if (0 <= x && x < width) // Default width = 656
                                     {
-                                        const auto score = sourcePtr[y * width + x];
+                                        const auto score = sourcePtrOffset[y * width + x];
                                         if (score > 0)
                                         {
                                             xAcc += x*score;
@@ -106,56 +233,18 @@ namespace op
                         // Best results for 1 scale: x + 0, y + 0.5
                         // +0.5 to both to keep Matlab format
                         const auto outputIndex = (peakIndex + 1) * 3;
-                        output[outputIndex] = xAcc / scoreAcc + offsetX;
-                        output[outputIndex + 1] = yAcc / scoreAcc + offsetY;
-                        output[outputIndex + 2] = sourcePtr[peakLocY*width + peakLocX];
+                        outputOffset[outputIndex] = xAcc / scoreAcc + offsetX;
+                        outputOffset[outputIndex + 1] = yAcc / scoreAcc + offsetY;
+                        outputOffset[outputIndex + 2] = sourcePtrOffset[peakLocY*width + peakLocX];
                     }
                 }
             }
             // If index 0 --> Assign number of peaks (truncated to the maximum possible number of peaks)
             else
-                output[0] = (kernelPtr[globalIdx] < maxPeaks ? kernelPtr[globalIdx] : maxPeaks);
+                outputOffset[0] = (local[threadIdx.x] < maxPeaks ? local[threadIdx.x] : maxPeaks);
         }
     }
 
-    // template <typename T>
-    // __global__ void sortKernel(T* targetPtr, const int channels, const int offsetTarget)
-    // {
-    //     const auto globalIdx = blockIdx.x * blockDim.x + threadIdx.x;
-
-    //     if (globalIdx < channels)
-    //     {
-    //         const auto totalOffset = globalIdx * offsetTarget;
-    //         const int nonZeroElementsPlus1 = targetPtr[totalOffset]+1;
-    //         for (auto i = 1 ; i < nonZeroElementsPlus1 ; i++)
-    //         {
-    //             // Find new maximum
-    //             const auto iIndex = totalOffset+3*i;
-    //             int maxIndex = i;
-    //             T maxIndexValue = targetPtr[iIndex+2];
-    //             for (auto j = i+1 ; j < nonZeroElementsPlus1 ; j++)
-    //             {
-    //                 if (maxIndexValue < targetPtr[totalOffset+3*j+2])
-    //                 {
-    //                     maxIndex = j;
-    //                     maxIndexValue = targetPtr[totalOffset+3*j+2];
-    //                 }
-    //             }
-    //             // Swap
-    //             const auto jIndex = totalOffset+3*maxIndex;
-    //             const T temp [3] = {targetPtr[iIndex],
-    //                                 targetPtr[iIndex+1],
-    //                                 targetPtr[iIndex+2]};
-    //             targetPtr[iIndex] = targetPtr[jIndex];
-    //             targetPtr[iIndex+1] = targetPtr[jIndex+1];
-    //             targetPtr[iIndex+2] = targetPtr[jIndex+2];
-    //             targetPtr[jIndex] = temp[0];
-    //             targetPtr[jIndex+1] = temp[1];
-    //             targetPtr[jIndex+2] = temp[2];
-    //         }
-    //     }
-    // }
-
     template <typename T>
     void nmsGpu(T* targetPtr, int* kernelPtr, const T* const sourcePtr, const T threshold,
                 const std::array<int, 4>& targetSize, const std::array<int, 4>& sourceSize, const Point<T>& offset)
@@ -188,54 +277,72 @@ namespace op
             // log("width_t: " + std::to_string(targetSize[3]));     // = 3 = [x, y, score]
             // log("");
 
-            for (auto n = 0; n < num; n++)
-            {
-                for (auto c = 0; c < channels; c++)
-                {
-                    // log("channel: " + std::to_string(c));
-                    const auto offsetChannel = (n * channels + c);
-                    auto* kernelPtrOffsetted = kernelPtr + offsetChannel * imageOffset;
-                    const auto* const sourcePtrOffsetted = sourcePtr + offsetChannel * imageOffset;
-                    auto* targetPtrOffsetted = targetPtr + offsetChannel * offsetTarget;
-
-                    // This returns kernelPtrOffsetted, a binary array with 0s & 1s. 1s in the local maximum
-                    // positions (size = size(sourcePtrOffsetted))
-                    // Example result: [0,0,0,0,1,0,0,0,0,1,0,0,0,0]
-                    nmsRegisterKernel<<<numBlocks2D, threadsPerBlock2D>>>(
-                        kernelPtrOffsetted, sourcePtrOffsetted, width, height, threshold);
-                    // // Debug
-                    // if (c==3)
-                    // {
-                    //     char filename[50];
-                    //     sprintf(filename, "work%02d.txt", c);
-                    //     std::ofstream fout(filename);
-                    //     int* kernelPtrOffsetted_local = mKernelBlob.mutable_cpu_data()
-                    //                                   + n * parts_num * imageOffset + c * imageOffset;
-                    //     for (int y = 0; y < height; y++){
-                    //         for (int x = 0; x < width; x++)
-                    //             fout << kernelPtrOffsetted_local[y*width + x] << "\t";
-                    //         fout<< std::endl;
-                    //     }
-                    //     fout.close();
-                    // }
-                    auto kernelThrustPtr = thrust::device_pointer_cast(kernelPtrOffsetted);
-
-                    // This modifies kernelPtrOffsetted, now it indicates the local maximum indexes
-                    // Format: 0,0,0,1,1,1,1,2,2,2,... First maximum at index 2, second at 6, etc...
-                    // Example result: [0,0,0,0,0,1,1,1,1,1,2,2,2,2]
-                    thrust::exclusive_scan(kernelThrustPtr, kernelThrustPtr + imageOffset, kernelThrustPtr);
-
-                    // This returns targetPtrOffsetted, with the NMS applied over it
-                    writeResultKernel<<<numBlocks1D, threadsPerBlock1D>>>(
-                        targetPtrOffsetted, imageOffset, kernelPtrOffsetted, sourcePtrOffsetted, width, height,
-                        maxPeaks, offset.x, offset.y);
+            // // Old code: Running 3 kernels per channel
+            // // const auto REPS = 1;
+            // const auto REPS = 1000;
+            // double timeNormalize1 = 0.;
+            // double timeNormalize2 = 0.;
+            // OP_CUDA_PROFILE_INIT(REPS);
+            // for (auto n = 0; n < num; n++)
+            // {
+            //     for (auto c = 0; c < channels; c++)
+            //     {
+            //         // log("channel: " + std::to_string(c));
+            //         const auto offsetChannel = (n * channels + c);
+            //         auto* kernelPtrOffsetted = kernelPtr + offsetChannel * imageOffset;
+            //         const auto* const sourcePtrOffsetted = sourcePtr + offsetChannel * imageOffset;
+            //         auto* targetPtrOffsetted = targetPtr + offsetChannel * offsetTarget;
+            //         // This returns kernelPtrOffsetted, a binary array with 0s & 1s. 1s in the local maximum
+            //         // positions (size = size(sourcePtrOffsetted))
+            //         // Example result: [0,0,0,0,1,0,0,0,0,1,0,0,0,0]
+            //         nmsRegisterKernelOld<<<numBlocks2D, threadsPerBlock2D>>>(
+            //             kernelPtrOffsetted, sourcePtrOffsetted, width, height, threshold);
+            //         // This modifies kernelPtrOffsetted, now it indicates the local maximum indexes
+            //         // Format: 0,0,0,1,1,1,1,2,2,2,... First maximum at index 2, second at 6, etc...
+            //         // Example result: [0,0,0,0,0,1,1,1,1,1,2,2,2,2]
+            //         auto kernelThrustPtr = thrust::device_pointer_cast(kernelPtrOffsetted);
+            //         thrust::exclusive_scan(kernelThrustPtr, kernelThrustPtr + imageOffset, kernelThrustPtr);
+            //         // This returns targetPtrOffsetted, with the NMS applied over it
+            //         writeResultKernelOld<<<numBlocks1D, threadsPerBlock1D>>>(
+            //             targetPtrOffsetted, imageOffset, kernelPtrOffsetted, sourcePtrOffsetted, width, height,
+            //             maxPeaks, offset.x, offset.y);
+            //     }
+            // }
+            // OP_CUDA_PROFILE_END(timeNormalize1, 1e3, REPS);
+            // OP_CUDA_PROFILE_INIT(REPS);
 
-                }
-                // // Sort based on score
-                // // Commented because it doesn't change accuracy
-                // // TODO: If finally used, implement for CPU/CL versions
-                // sortKernel<<<numBlocksSort, threadsPerBlockSort>>>(targetPtr, channels, offsetTarget);
-            }
+            // Optimized code: Running 3 kernels in total
+            // This returns kernelPtr, a binary array with 0s & 1s. 1s in the local maximum
+            // positions (size = size(sourcePtrOffsetted))
+            // Example result: [0,0,0,0,1,0,0,0,0,1,0,0,0,0]
+            // time = 1.24 ms
+            const dim3 threadsPerBlockRegister{THREADS_PER_BLOCK_1D, THREADS_PER_BLOCK_1D, 1};
+            const dim3 numBlocksRegister{getNumberCudaBlocks(width, threadsPerBlockRegister.x),
+                                         getNumberCudaBlocks(height, threadsPerBlockRegister.y),
+                                         getNumberCudaBlocks(num * channels, threadsPerBlockRegister.z)};
+            nmsRegisterKernel<<<numBlocksRegister, threadsPerBlockRegister>>>(
+                kernelPtr, sourcePtr, width, height, threshold);
+            // This modifies kernelPtrOffsetted, now it indicates the local maximum indexes
+            // Format: 0,0,0,1,1,1,1,2,2,2,... First maximum at index 2, second at 6, etc...
+            // Example result: [0,0,0,0,0,1,1,1,1,1,2,2,2,2]
+            // time = 2.71 ms
+            auto kernelThrustPtr = thrust::device_pointer_cast(kernelPtr);
+            thrust::exclusive_scan(kernelThrustPtr, kernelThrustPtr + num*channels*imageOffset, kernelThrustPtr);
+            // This returns targetPtrOffsetted, with the NMS applied over it
+            // time = 1.10 ms
+            const dim3 threadsPerBlockWrite{THREADS_PER_BLOCK, 1};
+            const dim3 numBlocksWrite{getNumberCudaBlocks(imageOffset, threadsPerBlockWrite.x),
+                                      getNumberCudaBlocks(num * channels, threadsPerBlockWrite.z)};
+            writeResultKernel<<<numBlocksWrite, threadsPerBlockWrite>>>(
+                targetPtr, imageOffset, kernelPtr, sourcePtr, width, height,
+                maxPeaks, offset.x, offset.y, offsetTarget);
+
+            // // Profiling code
+            // OP_CUDA_PROFILE_END(timeNormalize2, 1e3, REPS);
+            // log("  NMS1(or)=" + std::to_string(timeNormalize1) + "ms");
+            // log("  NMS2(1k)=" + std::to_string(timeNormalize2) + "ms");
+
+            // Sanity check
             cudaCheck(__LINE__, __FUNCTION__, __FILE__);
         }
         catch (const std::exception& e)