Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

JSON reader validation of values #15968

Merged
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
47 commits
Select commit Hold shift + click to select a range
bb991ef
validation of tokens code
karthikeyann Jun 11, 2024
4e707cb
fix pre-commit check failures
karthikeyann Jun 18, 2024
35a8268
Merge branch 'branch-24.08' into fea-json_spark_validation
karthikeyann Jun 18, 2024
cd6a30f
Merge branch 'branch-24.08' into fea-json_spark_validation
karthikeyann Jun 27, 2024
0c2e4da
Add Spark Compatible JSON validation (#10)
revans2 Aug 2, 2024
6a38578
Merge branch 'branch-24.08' of github.com:rapidsai/cudf into fea-json_spark_validation
karthikeyann Aug 2, 2024
0d6cb12
Merge branch 'branch-24.10' of github.com:rapidsai/cudf into fea-json_spark_validation
karthikeyann Aug 2, 2024
dfa6b18
Merge branch 'branch-24.10' into fea-json_spark_validation
karthikeyann Aug 9, 2024
e944937
style fixes
karthikeyann Aug 9, 2024
23072c0
Update json normalization to take device_buffer
karthikeyann Aug 9, 2024
a885340
fix char comparison error
karthikeyann Aug 9, 2024
3867c61
Merge branch 'branch-24.10' into fea-json_spark_validation
karthikeyann Aug 9, 2024
ab1385d
update char comparison
karthikeyann Aug 15, 2024
80c7c3a
Merge branch 'branch-24.10' into fea-json_spark_validation
karthikeyann Aug 26, 2024
f2e2b44
rename to tabulate_output_iterator.cuh
karthikeyann Aug 26, 2024
0963218
absorb counting_iterator to tabulate_output_iterator
karthikeyann Aug 26, 2024
be7402c
update documentation
karthikeyann Aug 26, 2024
b114401
add na_values to validation
karthikeyann Aug 26, 2024
a1e9afc
add strict validation to test
karthikeyann Aug 26, 2024
ec78ef9
rename tabulate_output_iterator namespace
karthikeyann Aug 26, 2024
a225ce0
remove comments and notes
karthikeyann Aug 26, 2024
7a2a451
Merge branch 'branch-24.10' into fea-json_spark_validation
karthikeyann Aug 26, 2024
875a72b
fix unsigned/signed issue with ARM systems
karthikeyann Sep 3, 2024
ef6f298
Merge branch 'branch-24.10' into fea-json_spark_validation
karthikeyann Sep 3, 2024
be7f17e
remove comments
karthikeyann Sep 3, 2024
fb62877
fix condition
karthikeyann Sep 4, 2024
e4f7d04
fix char issue with typecast
karthikeyann Sep 5, 2024
851fe3e
Update cpp/include/cudf/io/json.hpp
karthikeyann Sep 5, 2024
35e4b89
Update cpp/include/cudf/io/json.hpp
karthikeyann Sep 5, 2024
3681823
address review comments
karthikeyann Sep 5, 2024
1d897f7
fix doc
karthikeyann Sep 5, 2024
e1435ce
Merge branch 'branch-24.10' into fea-json_spark_validation
karthikeyann Sep 5, 2024
6bf4d3f
address review comments
karthikeyann Sep 5, 2024
e9ebb91
Merge branch 'branch-24.10' into fea-json_spark_validation
karthikeyann Sep 6, 2024
e093d64
address review comments
karthikeyann Sep 9, 2024
00ef690
rename lambda name
karthikeyann Sep 9, 2024
86bbeab
Merge branch 'branch-24.10' into fea-json_spark_validation
karthikeyann Sep 9, 2024
cecb42f
Apply suggestions from code review
karthikeyann Sep 10, 2024
9cd3098
Apply suggestions from code review
karthikeyann Sep 10, 2024
c816c73
update docs
karthikeyann Sep 10, 2024
53db703
Update cpp/include/cudf/io/json.hpp
ttnghia Sep 10, 2024
c3832b6
Update cpp/include/cudf/io/json.hpp
ttnghia Sep 10, 2024
070263e
Update cpp/include/cudf/io/json.hpp
ttnghia Sep 10, 2024
fb0e85f
fix strict_validation dependent options with if
karthikeyann Sep 10, 2024
e7fce07
Merge branch 'branch-24.10' into fea-json_spark_validation
karthikeyann Sep 10, 2024
252c38b
fix typo
karthikeyann Sep 10, 2024
5ab337b
Merge branch 'branch-24.10' into fea-json_spark_validation
karthikeyann Sep 10, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
rename to tabulate_output_iterator.cuh
  • Loading branch information
karthikeyann committed Aug 26, 2024
commit f2e2b448e6fc02ba178aa8eef7f243d31e628e4a
7 changes: 3 additions & 4 deletions cpp/src/io/json/process_tokens.cu
Original file line number Diff line number Diff line change
Expand Up @@ -16,8 +16,7 @@
*/

#include "nested_json.hpp"
// #include "tabulate_output_iterator.cuh"
#include "output_writer_iterator.h"
#include "tabulate_output_iterator.cuh"

#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/detail/utilities/vector_factories.hpp>
Expand Down Expand Up @@ -231,7 +230,7 @@ void validate_token_stream(device_span<char const> d_input,
auto u_count = 0;
for (SymbolOffsetT idx = start + 1; idx < end; idx++) {
auto c = data[idx];
if (!allow_unquoted_control_chars && c >= 0 && c < 32) { return false; }
if (!allow_unquoted_control_chars && c < 32) { return false; }
karthikeyann marked this conversation as resolved.
Show resolved Hide resolved

switch (state) {
case string_state::normal:
Expand Down Expand Up @@ -285,7 +284,7 @@ void validate_token_stream(device_span<char const> d_input,
// auto conditional_output_it = tokens.begin();
// auto conditional_output_it = thrust::make_tabulate_output_iterator(conditional_write);
auto conditional_output_it =
thrust::make_output_writer_iterator(thrust::make_counting_iterator(0), conditional_write);
thrust::make_tabulate_output_iterator(thrust::make_counting_iterator(0), conditional_write);
auto transform_op = cuda::proclaim_return_type<scan_type>(
[predicate, tokens = tokens.begin()] __device__(auto i) -> scan_type {
if (predicate(i)) return {token_t::ErrorBegin, tokens[i] == token_t::LineEnd};
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
* limitations under the License.
*/

// Output writer iterator
// Tabulate Output iterator
karthikeyann marked this conversation as resolved.
Show resolved Hide resolved
#pragma once

#include <thrust/iterator/iterator_adaptor.h>
Expand All @@ -24,14 +24,14 @@ namespace detail {

// Proxy reference that calls BinaryFunction with Iterator value and the rhs of assignment operator
template <typename BinaryFunction, typename Iterator>
class output_writer_iterator_proxy {
class tabulate_output_iterator_proxy {
public:
__host__ __device__ output_writer_iterator_proxy(const Iterator& index_iter, BinaryFunction fun)
__host__ __device__ tabulate_output_iterator_proxy(const Iterator& index_iter, BinaryFunction fun)
: index_iter(index_iter), fun(fun)
{
}
template <typename T>
__host__ __device__ output_writer_iterator_proxy operator=(const T& x)
__host__ __device__ tabulate_output_iterator_proxy operator=(const T& x)
{
fun(*index_iter, x);
return *this;
Expand All @@ -42,19 +42,19 @@ class output_writer_iterator_proxy {
BinaryFunction fun;
};

// Register output_writer_iterator_proxy with 'is_proxy_reference' from
// Register tabulate_output_iterator_proxy with 'is_proxy_reference' from
// type_traits to enable its use with algorithms.
template <class BinaryFunction, class Iterator>
struct is_proxy_reference<output_writer_iterator_proxy<BinaryFunction, Iterator>>
struct is_proxy_reference<tabulate_output_iterator_proxy<BinaryFunction, Iterator>>
: public thrust::detail::true_type {};

} // namespace detail

/**
* @brief Transform output iterator with custom writer binary function which takes index and value.
* @brief Transform output iterator with custom binary function which takes index and value.
*
* @code {.cpp}
* #include <thrust/iterator/output_writer_iterator.cuh>
* #include <thrust/iterator/tabulate_output_iterator.cuh>
* #include <thrust/device_vector.h>
* #include <thrust/iterator/counting_iterator.h>
* #include <thrust/iterator/transform_iterator.h>
Expand All @@ -80,7 +80,7 @@ struct is_proxy_reference<output_writer_iterator_proxy<BinaryFunction, Iterator>
* };
*
* thrust::device_vector<int> v(1, 0x00000000);
* auto result_begin = thrust::make_output_writer_iterator(thrust::make_counting_iterator(0),
* auto result_begin = thrust::make_tabulate_output_iterator(thrust::make_counting_iterator(0),
* set_bits_field{v.data().get()});
* auto value = thrust::make_transform_iterator(thrust::make_counting_iterator(0),
* [] __device__ (int x) { return x%2; });
Expand All @@ -94,27 +94,27 @@ struct is_proxy_reference<output_writer_iterator_proxy<BinaryFunction, Iterator>
* @tparam Iterator iterator type that acts as index of the output.
*/
template <typename BinaryFunction, typename Iterator>
class output_writer_iterator
class tabulate_output_iterator
: public thrust::iterator_adaptor<
output_writer_iterator<BinaryFunction, Iterator>,
tabulate_output_iterator<BinaryFunction, Iterator>,
Iterator,
thrust::use_default,
thrust::use_default,
thrust::use_default,
thrust::detail::output_writer_iterator_proxy<BinaryFunction, Iterator>> {
thrust::detail::tabulate_output_iterator_proxy<BinaryFunction, Iterator>> {
public:
// parent class.
typedef thrust::iterator_adaptor<
output_writer_iterator<BinaryFunction, Iterator>,
tabulate_output_iterator<BinaryFunction, Iterator>,
Iterator,
thrust::use_default,
thrust::use_default,
thrust::use_default,
thrust::detail::output_writer_iterator_proxy<BinaryFunction, Iterator>>
thrust::detail::tabulate_output_iterator_proxy<BinaryFunction, Iterator>>
super_t;
// friend thrust::iterator_core_access to allow it access to the private interface dereference()
friend class thrust::iterator_core_access;
__host__ __device__ output_writer_iterator(Iterator const& x, BinaryFunction fun)
__host__ __device__ tabulate_output_iterator(Iterator const& x, BinaryFunction fun)
: super_t(x), fun(fun)
{
}
Expand All @@ -125,15 +125,15 @@ class output_writer_iterator
// thrust::iterator_core_access accesses this function
__host__ __device__ typename super_t::reference dereference() const
{
return thrust::detail::output_writer_iterator_proxy<BinaryFunction, Iterator>(
return thrust::detail::tabulate_output_iterator_proxy<BinaryFunction, Iterator>(
this->base_reference(), fun);
}
};

template <typename BinaryFunction, typename Iterator>
output_writer_iterator<BinaryFunction, Iterator> __host__ __device__
make_output_writer_iterator(Iterator out, BinaryFunction fun)
tabulate_output_iterator<BinaryFunction, Iterator> __host__ __device__
make_tabulate_output_iterator(Iterator out, BinaryFunction fun)
{
return output_writer_iterator<BinaryFunction, Iterator>(out, fun);
} // end make_output_writer_iterator
return tabulate_output_iterator<BinaryFunction, Iterator>(out, fun);
} // end make_tabulate_output_iterator
THRUST_NAMESPACE_END