Commit: Merge branch 'branch-0.5' into python37
kkraus14 committed Jan 14, 2019
2 parents 818c819 + ad0e8af commit 43f2dbd
Showing 35 changed files with 1,149 additions and 479 deletions.
20 changes: 8 additions & 12 deletions .github/PULL_REQUEST_TEMPLATE.md
@@ -1,26 +1,22 @@
<!--
-Thanks for wanting to contribute to cuDF :)
+Thank you for contributing to cuDF :)
First, if you need some help or want to chat to the core developers, please
visit https://rapids.ai/community.html for links to our Google Group and other
communication channels.
-Here are some guidelines to help the review process go smoothly.
+Here's some guidelines to help the review process go smoothly.
-0. Please write a description in this text box of the changes that are being
+1. Please write a description in this text box of the changes that are being
made.
-1. Please ensure that you have written units tests for the changes made/features
+2. Please ensure that you have written units tests for the changes made/features
added.
-2. If you are closing an issue please use one of the automatic closing words as
+3. If you are closing an issue please use one of the automatic closing words as
noted here: https://help.github.com/articles/closing-issues-using-keywords/
-3. If your pull request is not ready for review but you want to make use of the
+4. If your pull request is not ready for review but you want to make use of the
continuous integration testing facilities please label it with `[WIP]`.
-4. If your pull request is ready to be reviewed without requiring additional
+5. If your pull request is ready to be reviewed without requiring additional
work on top of it, then remove the `[WIP]` label (if present) and replace
it with `[REVIEW]`. If assistance is required to complete the functionality,
for example when the C/C++ code of a feature is complete but Python bindings
@@ -34,7 +30,7 @@ Here's some guidelines to help the review process go smoothly.
opened by the person assisting, which then will be the PR that will be
merged.
-5. Once all work has been done and review has taken place please do not add
+6. Once all work has been done and review has taken place please do not add
features or make changes out of the scope of those requested by the reviewer
(doing this just add delays as already reviewed code ends up having to be
re-reviewed/it is hard to tell what is new etc!). Further, please do not
14 changes: 13 additions & 1 deletion CHANGELOG.md
@@ -19,8 +19,10 @@
- PR #564 Update python `sort_values` method to use updated libcudf `gdf_order_by` API
- PR #509 CSV Reader: Input CSV file can now be passed in as a text or a binary buffer
- PR #607 Add `__iter__` and iteritems to DataFrame class
- PR #600 Enable deep or shallow copy
- PR #635 Add Doxygen template
- PR #649 Add `cudf.from_pandas` function
- PR #633 CSV Reader: Add support for the nrows parameter to specify the number of rows to read from the input file

## Improvements

@@ -32,17 +34,20 @@
- PR #502 Simplify Dockerfile for local dev, eliminate old conda/pip envs
- PR #549 Adds `-rdynamic` compiler flag to nvcc for Debug builds
- PR #472 RMM: Created centralized rmm::device_vector alias and rmm::exec_policy
- PR #500 Improved the concurrent hash map class to support partitioned (multi-pass) hash table building
- PR #617 Added .dockerignore file. Prevents adding stale cmake cache files to the docker container
- PR #658 Reduced `JOIN_TEST` time by isolating overflow test of hash table size computation
- PR #651 Remove noqa marks in `__init__.py` files
- PR #671 CSV Reader: uncompressed buffer input can be parsed without explicitly specifying compression as None

## Bug Fixes

- PR #569 CSV Reader: Fix days being off-by-one when parsing some dates
- PR #531 CSV Reader: Fix incorrect parsing of quoted numbers
- PR #465 Added templated C++ API for RMM to avoid explicit cast to `void**`
- PR #473 Added missing <random> include
- PR #478 CSV Reader: Add api support for auto column detection, header, mangle_dupe_cols, usecols
- PR #495 Updated README to correct where cffi pytest should be executed
- PR #500 Improved the concurrent hash map class to support partitioned (multi-pass) hash table building
- PR #501 Fix the intermittent segfault caused by the `thousands` and `compression` parameters in the csv reader
- PR #502 Simplify Dockerfile for local dev, eliminate old conda/pip envs
- PR #512 fix bug for `on` parameter in `DataFrame.merge` to allow for None or single column name
@@ -56,9 +61,16 @@
- PR #612 Prevent an exception from occuring with true division on integer series.
- PR #630 Fix deprecation warning for `pd.core.common.is_categorical_dtype`
- PR #622 Fix Series.append() behaviour when appending values with different numeric dtype
- PR #644 Fix return type and column support of dataframe.quantile()
- PR #634 Fix create `DataFrame.from_pandas()` with numeric column names
- PR #654 Add resolution check for GDF_TIMESTAMP in Join
- PR #648 Enforce one-to-one copy required when using `numba>=0.42.0`
- PR #645 Fix cmake build type handling not setting debug options when CMAKE_BUILD_TYPE=="Debug"
- PR #665 Reworked the hash map to add a way to report the destination partition for a key
- PR #670 CMAKE: Fix env include path taking precedence over libcudf source headers
- PR #674 Check for gdf supported column types
- PR #677 Fix 'gdf_csv_test_Dates' gtest failure due to missing nrows parameter
- PR #686 Fix converting nulls to NaT values when converting Series to Pandas/Numpy


# cuDF 0.4.0 (05 Dec 2018)
2 changes: 1 addition & 1 deletion ci/checks/style.sh
@@ -12,7 +12,7 @@ PATH=/conda/bin:$PATH
source activate gdf

# Run flake8 and get results/return code
-FLAKE=`flake8`
+FLAKE=`flake8 python`
RETVAL=$?

# Output results if failure otherwise show pass
9 changes: 5 additions & 4 deletions cpp/CMakeLists.txt
@@ -22,7 +22,7 @@ project(CUDA_DATAFRAME VERSION 0.4.0 LANGUAGES C CXX CUDA)

# Set a default build type if none was specified
set(DEFAULT_BUILD_TYPE "Release")

if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
message(STATUS "Setting build type to '${DEFAULT_BUILD_TYPE}' since none specified.")
set(CMAKE_BUILD_TYPE "${DEFAULT_BUILD_TYPE}" CACHE
@@ -98,9 +98,9 @@ endif()

find_package(ZLIB REQUIRED)
if(ZLIB_FOUND)
-include_directories(${ZLIB_INCLUDE_DIRS})
message(STATUS "ZLib found in ${ZLIB_INCLUDE_DIRS}")
else()
-message(FATAL_ERROR "ZLib not found.")
+message(FATAL_ERROR "ZLib not found, please check your settings.")
endif(ZLIB_FOUND)

###################################################################################################
@@ -130,7 +130,8 @@ include_directories("${ARROW_INCLUDE_DIR}"
"${CMAKE_SOURCE_DIR}/src"
"${CMAKE_SOURCE_DIR}/thirdparty/cub"
"${CMAKE_SOURCE_DIR}/thirdparty/moderngpu/src"
"${CMAKE_SOURCE_DIR}/thirdparty/cnmem/include")
"${CMAKE_SOURCE_DIR}/thirdparty/cnmem/include"
"${ZLIB_INCLUDE_DIRS}")

###################################################################################################
# - library paths ---------------------------------------------------------------------------------
4 changes: 2 additions & 2 deletions cpp/include/cudf/io_types.h
@@ -67,8 +67,8 @@ typedef struct {
const char **use_cols_char; // array of char: Return a subset of the columns. CSV reader will only process those columns, another read is needed to get full data
int use_cols_char_len; // int: number of elements in list of returned columns

-long skiprows; /**< number of rows at the start of the files to skip, default is 0 */
-long skipfooter; /**< number of rows at the bottom of the file to skip - default is 0 */
+gdf_size_type skiprows; /**< number of rows at the start of the files to skip, default is 0 */
+gdf_size_type skipfooter; /**< number of rows at the bottom of the file to skip - default is 0 */

bool skip_blank_lines; // whether or not to ignore blank lines

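The type change above aligns the row-skip fields with the library's own row-count type (gdf_size_type) instead of a bare long. A minimal usage sketch follows; note that the struct's typedef name and the header path are assumptions taken from context, since the excerpt does not show the typedef's closing name:

#include <cudf/io_types.h>  // assumed header path for the struct shown above

int main() {
  csv_read_arg args{};            // assumption: the typedef above is named csv_read_arg
  args.skiprows         = 10;     // gdf_size_type: skip the first 10 rows of the file
  args.skipfooter       = 2;      // gdf_size_type: skip the last 2 rows of the file
  args.skip_blank_lines = true;   // field shown in the surrounding context above
  return 0;
}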
45 changes: 23 additions & 22 deletions cpp/include/cudf/types.h
@@ -39,28 +39,29 @@ typedef enum {
/* ----------------------------------------------------------------------------*/
typedef enum {
GDF_SUCCESS=0,
-GDF_CUDA_ERROR, /**< Error occured in a CUDA call */
-GDF_UNSUPPORTED_DTYPE, /**< The datatype of the gdf_column is unsupported */
-GDF_COLUMN_SIZE_MISMATCH, /**< Two columns that should be the same size aren't the same size*/
-GDF_COLUMN_SIZE_TOO_BIG, /**< Size of column is larger than the max supported size */
-GDF_DATASET_EMPTY, /**< Input dataset is either null or has size 0 when it shouldn't */
-GDF_VALIDITY_MISSING, /**< gdf_column's validity bitmask is null */
-GDF_VALIDITY_UNSUPPORTED, /**< The requested gdf operation does not support validity bitmask handling, and one of the input columns has the valid bits enabled */
-GDF_INVALID_API_CALL, /**< The arguments passed into the function were invalid */
-GDF_JOIN_DTYPE_MISMATCH, /**< Datatype mismatch between corresponding columns in left/right tables in the Join function */
-GDF_JOIN_TOO_MANY_COLUMNS, /**< Too many columns were passed in for the requested join operation*/
-GDF_DTYPE_MISMATCH, /**< Type mismatch between columns that should be the same type */
-GDF_UNSUPPORTED_METHOD, /**< The method requested to perform an operation was invalid or unsupported (e.g., hash vs. sort)*/
-GDF_INVALID_AGGREGATOR, /**< Invalid aggregator was specified for a groupby*/
-GDF_INVALID_HASH_FUNCTION, /**< Invalid hash function was selected */
-GDF_PARTITION_DTYPE_MISMATCH, /**< Datatype mismatch between columns of input/output in the hash partition function */
-GDF_HASH_TABLE_INSERT_FAILURE, /**< Failed to insert to hash table, likely because its full */
-GDF_UNSUPPORTED_JOIN_TYPE, /**< The type of join requested is unsupported */
-GDF_C_ERROR, /**< C error not related to CUDA */
-GDF_FILE_ERROR, /**< error processing sepcified file */
-GDF_MEMORYMANAGER_ERROR, /**< Memory manager error (see memory.h) */
-GDF_UNDEFINED_NVTX_COLOR, /**< The requested color used to define an NVTX range is not defined */
-GDF_NULL_NVTX_NAME, /**< The requested name for an NVTX range cannot be nullptr */
+GDF_CUDA_ERROR, /**< Error occured in a CUDA call */
+GDF_UNSUPPORTED_DTYPE, /**< The datatype of the gdf_column is unsupported */
+GDF_COLUMN_SIZE_MISMATCH, /**< Two columns that should be the same size aren't the same size*/
+GDF_COLUMN_SIZE_TOO_BIG, /**< Size of column is larger than the max supported size */
+GDF_DATASET_EMPTY, /**< Input dataset is either null or has size 0 when it shouldn't */
+GDF_VALIDITY_MISSING, /**< gdf_column's validity bitmask is null */
+GDF_VALIDITY_UNSUPPORTED, /**< The requested gdf operation does not support validity bitmask handling, and one of the input columns has the valid bits enabled */
+GDF_INVALID_API_CALL, /**< The arguments passed into the function were invalid */
+GDF_JOIN_DTYPE_MISMATCH, /**< Datatype mismatch between corresponding columns in left/right tables in the Join function */
+GDF_JOIN_TOO_MANY_COLUMNS, /**< Too many columns were passed in for the requested join operation*/
+GDF_DTYPE_MISMATCH, /**< Type mismatch between columns that should be the same type */
+GDF_UNSUPPORTED_METHOD, /**< The method requested to perform an operation was invalid or unsupported (e.g., hash vs. sort)*/
+GDF_INVALID_AGGREGATOR, /**< Invalid aggregator was specified for a groupby*/
+GDF_INVALID_HASH_FUNCTION, /**< Invalid hash function was selected */
+GDF_PARTITION_DTYPE_MISMATCH, /**< Datatype mismatch between columns of input/output in the hash partition function */
+GDF_HASH_TABLE_INSERT_FAILURE, /**< Failed to insert to hash table, likely because its full */
+GDF_UNSUPPORTED_JOIN_TYPE, /**< The type of join requested is unsupported */
+GDF_C_ERROR, /**< C error not related to CUDA */
+GDF_FILE_ERROR, /**< error processing sepcified file */
+GDF_MEMORYMANAGER_ERROR, /**< Memory manager error (see memory.h) */
+GDF_UNDEFINED_NVTX_COLOR, /**< The requested color used to define an NVTX range is not defined */
+GDF_NULL_NVTX_NAME, /**< The requested name for an NVTX range cannot be nullptr */
+GDF_TIMESTAMP_RESOLUTION_MISMATCH, /**< Resolution mismatch between two columns of GDF_TIMESTAMP */
N_GDF_ERRORS
} gdf_error;

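libcudf functions report failures through this gdf_error enum, so callers check each return value against GDF_SUCCESS. A minimal checking helper, sketched here for illustration only; the macro is invented for this example and is not part of the diff, and the include path is an assumption:

#include <cudf/types.h>  // assumed header path for gdf_error
#include <cstdio>

// Illustrative sketch: print and propagate the first failing gdf_error.
// Only valid inside functions that themselves return gdf_error.
#define CHECK_GDF(call)                                              \
  do {                                                               \
    gdf_error _status = (call);                                      \
    if (_status != GDF_SUCCESS) {                                    \
      std::fprintf(stderr, "gdf_error %d at %s:%d\n",                \
                   static_cast<int>(_status), __FILE__, __LINE__);   \
      return _status;                                                \
    }                                                                \
  } while (0)

With such a helper, a join on two GDF_TIMESTAMP columns of different resolutions would surface the newly added GDF_TIMESTAMP_RESOLUTION_MISMATCH code (see PR #654 in the changelog above).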
135 changes: 70 additions & 65 deletions cpp/src/hash/concurrent_unordered_multimap.cuh
@@ -387,7 +387,72 @@ public:
{
return unused_key;
}


+/* --------------------------------------------------------------------------*/
+/**
+* @Synopsis Computes a hash value for a key
+*
+* @Param[in] the_key The key to compute a hash for
+* @tparam hash_value_type The datatype of the hash value
+*
+* @Returns The hash value for the key
+*/
+/* ----------------------------------------------------------------------------*/
+template <typename hash_value_type = typename Hasher::result_type>
+__forceinline__
+__host__ __device__ hash_value_type get_hash(const key_type& the_key) const
+{
+return m_hf(the_key);
+}
+
+/* --------------------------------------------------------------------------*/
+/**
+* @Synopsis Computes the destination hash map partition for a key
+*
+* @Param[in] the_key The key to search for
+* @Param[in] num_parts The total number of partitions in the partitioned
+* hash table
+* @Param[in] precomputed_hash A flag indicating whether or not a precomputed
+* hash value is passed in
+* @Param[in] precomputed_hash_value A precomputed hash value to use for determing
+* the write location of the key into the hash map instead of computing the
+* the hash value directly from the key
+* @tparam hash_value_type The datatype of the hash value
+*
+* @Returns The destination hash table partition for the specified key
+*/
+/* ----------------------------------------------------------------------------*/
+template <typename hash_value_type = typename Hasher::result_type>
+__forceinline__
+__host__ __device__ int get_partition(const key_type& the_key,
+const int num_parts = 1,
+bool precomputed_hash = false,
+hash_value_type precomputed_hash_value = 0) const
+{
+hash_value_type hash_value{0};
+
+// If a precomputed hash value has been passed in, then use it to determine
+// the location of the key
+if(true == precomputed_hash) {
+hash_value = precomputed_hash_value;
+}
+// Otherwise, compute the hash value from the key
+else {
+hash_value = m_hf(the_key);
+}
+
+size_type hash_tbl_idx = hash_value % m_hashtbl_size;
+
+const size_type partition_size = m_hashtbl_size/num_parts;
+
+int dest_part = hash_tbl_idx/partition_size;
+// Note that if m_hashtbl_size % num_parts != 0 then dest_part can be
+// num_parts for the last few elements and we remap that to the
+// num_parts-1 partition
+if (dest_part == num_parts) dest_part = num_parts-1;
+
+return dest_part;
+}

/* --------------------------------------------------------------------------*/
/**
@@ -518,8 +583,6 @@ public:
hash_value_type precomputed_hash_value = 0,
comparison_type keys_are_equal = key_equal())
{
-const size_type hashtbl_size = m_hashtbl_size;

hash_value_type hash_value{0};

// If a precomputed hash value has been passed in, then use it to determine
@@ -534,13 +597,11 @@
hash_value = m_hf(x.first);
}

-size_type hash_tbl_idx = hash_value % hashtbl_size;

-const size_type partition_size = m_hashtbl_size/num_parts;
+// Find the destination partition index
+int dest_part = get_partition(x.first, num_parts, true, hash_value);

-// Only insert into the specified partition
-if( ( part < (num_parts-1) && hash_tbl_idx/partition_size != part ) ||
-( (num_parts-1) == part && hash_tbl_idx/partition_size < part ) )
+// Only insert if the key belongs to the specified partition
+if ( dest_part != part )
return end();
else
return insert(x, true, hash_value, keys_are_equal);
@@ -608,62 +669,6 @@ public:
return const_iterator( m_hashtbl_values,m_hashtbl_values+m_hashtbl_size,begin_ptr);
}

-/* --------------------------------------------------------------------------*/
-/**
-* @Synopsis Searches for a key in the hash map partition and returns an
-* iterator to the first instance of the key in the map, or the end()
-* iterator if the key could not be found in the specified partition.
-*
-* @Param[in] the_key The key to search for
-* @Param[in] part The partitions number for the partitioned hash table
-* @Param[in] num_parts The total number of partitions in the partitioned
-* hash table
-* @Param[in] precomputed_hash A flag indicating whether or not a precomputed
-* hash value is passed in
-* @Param[in] precomputed_hash_value A precomputed hash value to use for determing
-* the write location of the key into the hash map instead of computing the
-* the hash value directly from the key
-* @Param[in] keys_are_equal An optional functor for comparing if two keys are equal
-* @tparam hash_value_type The datatype of the hash value
-* @tparam comparison_type The type of the key comparison functor
-*
-* @Returns An iterator to the first instance of the key in the map
-*/
-/* ----------------------------------------------------------------------------*/
-template < typename hash_value_type = typename Hasher::result_type,
-typename comparison_type = key_equal>
-__forceinline__
-__host__ __device__ const_iterator find_part(const key_type& the_key,
-const int part = 0,
-const int num_parts = 1,
-bool precomputed_hash = false,
-hash_value_type precomputed_hash_value = 0,
-comparison_type keys_are_equal = key_equal()) const
-{
-hash_value_type hash_value{0};
-
-// If a precomputed hash value has been passed in, then use it to determine
-// the location of the key
-if(true == precomputed_hash) {
-hash_value = precomputed_hash_value;
-}
-// Otherwise, compute the hash value from the key
-else {
-hash_value = m_hf(the_key);
-}
-
-size_type hash_tbl_idx = hash_value % m_hashtbl_size;
-
-const size_type partition_size = m_hashtbl_size/num_parts;
-
-// Only probe the specified partition
-if( ( part < (num_parts-1) && hash_tbl_idx/partition_size != part ) ||
-( (num_parts-1) == part && hash_tbl_idx/partition_size < part ) )
-return end();
-else
-return find(the_key, true, hash_value, keys_are_equal);
-}

gdf_error assign_async( const concurrent_unordered_multimap& other, cudaStream_t stream = 0 )
{
m_collisions = other.m_collisions;
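The partitioning scheme that get_partition() introduces (and that the reworked insert_part() now reuses) is plain integer arithmetic: the hash table is split into num_parts equal slices, and the remainder slots of an unevenly divided table are folded into the last partition. A self-contained restatement of that arithmetic, with function and parameter names invented here for illustration:

#include <cstddef>

// Standalone sketch of the get_partition() math shown in the diff above.
int destination_partition(std::size_t hash_value,
                          std::size_t table_size,
                          int num_parts) {
  const std::size_t slot           = hash_value % table_size;  // table slot for this hash
  const std::size_t partition_size = table_size / num_parts;   // slots per partition
  int dest_part = static_cast<int>(slot / partition_size);
  // When table_size % num_parts != 0, the trailing slots map past the last
  // partition; remap them onto partition num_parts - 1, as the diff does.
  if (dest_part == num_parts) dest_part = num_parts - 1;
  return dest_part;
}

A multi-pass build can then compute this once per key and skip any key whose destination is not the partition currently being built, which is exactly the check the simplified insert_part() performs via dest_part != part.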
