Skip to content

Commit b7c0e4e

Browse files
authored
Merge 5561261 into 12b269e
2 parents 12b269e + 5561261 commit b7c0e4e

File tree

4 files changed

+581
-22
lines changed

4 files changed

+581
-22
lines changed

ydb/apps/ydb/CHANGELOG.md

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
* Added CREATE TABLE text suggestion on scheme error during `ydb import file csv`
12

23
## 2.18.0 ##
34

ydb/public/lib/ydb_cli/common/csv_parser.cpp

Lines changed: 276 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,8 @@
55

66
#include <library/cpp/string_utils/csv/csv.h>
77

8+
#include <regex>
9+
810
namespace NYdb {
911
namespace NConsoleClient {
1012
namespace {
@@ -289,6 +291,120 @@ class TCsvToYdbConverter {
289291
}
290292
}
291293

294+
template <class T>
295+
bool TryParseArithmetic(const TString& token) const {
296+
size_t cnt;
297+
try {
298+
auto value = StringToArithmetic<T>(token, cnt);
299+
if (cnt != token.size() || value < std::numeric_limits<T>::lowest() || value > std::numeric_limits<T>::max()) {
300+
return false;
301+
}
302+
} catch (std::exception& e) {
303+
return false;
304+
}
305+
return true;
306+
}
307+
308+
bool TryParseBool(const TString& token) const {
309+
TString tokenLowerCase = to_lower(token);
310+
return tokenLowerCase == "true" || tokenLowerCase == "false";
311+
}
312+
313+
bool TryParsePrimitive(const TString& token) {
314+
switch (Parser.GetPrimitive()) {
315+
case EPrimitiveType::Uint8:
316+
return TryParseArithmetic<ui8>(token);
317+
case EPrimitiveType::Uint16:
318+
return TryParseArithmetic<ui16>(token);
319+
case EPrimitiveType::Uint32:
320+
return TryParseArithmetic<ui32>(token);
321+
case EPrimitiveType::Uint64:
322+
return TryParseArithmetic<ui64>(token);
323+
case EPrimitiveType::Int8:
324+
return TryParseArithmetic<i8>(token);
325+
case EPrimitiveType::Int16:
326+
return TryParseArithmetic<i16>(token);
327+
case EPrimitiveType::Int32:
328+
return TryParseArithmetic<i32>(token);
329+
case EPrimitiveType::Int64:
330+
return TryParseArithmetic<i64>(token);
331+
case EPrimitiveType::Bool:
332+
return TryParseBool(token);
333+
case EPrimitiveType::Json:
334+
return token.StartsWith('{') && token.EndsWith('}');
335+
break;
336+
case EPrimitiveType::JsonDocument:
337+
break;
338+
case EPrimitiveType::Yson:
339+
break;
340+
case EPrimitiveType::Uuid:
341+
static std::regex uuidRegexTemplate("[0-9a-f]{8}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{4}-[0-9a-f]{12}");
342+
return std::regex_match(token.c_str(), uuidRegexTemplate);
343+
case EPrimitiveType::Float:
344+
return TryParseArithmetic<float>(token);
345+
case EPrimitiveType::Double:
346+
return TryParseArithmetic<double>(token);
347+
case EPrimitiveType::DyNumber:
348+
break;
349+
case EPrimitiveType::Date: {
350+
TInstant date;
351+
return TInstant::TryParseIso8601(token, date) && token.length() <= 10;
352+
}
353+
case EPrimitiveType::Datetime: {
354+
TInstant datetime;
355+
return TInstant::TryParseIso8601(token, datetime) && token.length() <= 19;
356+
}
357+
case EPrimitiveType::Timestamp: {
358+
TInstant timestamp;
359+
return TInstant::TryParseIso8601(token, timestamp) || TryParseArithmetic<ui64>(token);
360+
}
361+
case EPrimitiveType::Interval:
362+
break;
363+
case EPrimitiveType::Date32: {
364+
TInstant date;
365+
return TInstant::TryParseIso8601(token, date) || TryParseArithmetic<i32>(token);
366+
}
367+
case EPrimitiveType::Datetime64: {
368+
TInstant date;
369+
return TInstant::TryParseIso8601(token, date) || TryParseArithmetic<i64>(token);
370+
}
371+
case EPrimitiveType::Timestamp64: {
372+
TInstant date;
373+
return TInstant::TryParseIso8601(token, date) || TryParseArithmetic<i64>(token);
374+
}
375+
case EPrimitiveType::Interval64:
376+
return TryParseArithmetic<i64>(token);
377+
case EPrimitiveType::TzDate:
378+
break;
379+
case EPrimitiveType::TzDatetime:
380+
break;
381+
case EPrimitiveType::TzTimestamp:
382+
break;
383+
default:
384+
throw TCsvParseException() << "Unsupported primitive type: " << Parser.GetPrimitive();
385+
}
386+
return false;
387+
}
388+
389+
bool TryParseValue(const TStringBuf& token, TPossibleType& possibleType) {
390+
if (NullValue && token == NullValue) {
391+
possibleType.SetHasNulls(true);
392+
return true;
393+
}
394+
possibleType.SetHasNonNulls(true);
395+
switch (Parser.GetKind()) {
396+
case TTypeParser::ETypeKind::Primitive: {
397+
return TryParsePrimitive(TString(token));
398+
}
399+
case TTypeParser::ETypeKind::Decimal: {
400+
break;
401+
}
402+
default:
403+
throw TCsvParseException() << "Unsupported type kind: " << Parser.GetKind();
404+
}
405+
return false;
406+
}
407+
292408
TValue Convert(const TStringBuf& token) {
293409
BuildValue(token);
294410
return Builder.Build();
@@ -330,6 +446,16 @@ TValue FieldToValue(TTypeParser& parser,
330446
}
331447
}
332448

449+
// Builds a converter for the given type parser and asks it whether the token fits that type.
// Any std::exception raised on the way is written to Cerr and reported as "does not fit".
bool TryParse(TTypeParser& parser, const TStringBuf& token, const std::optional<TString>& nullValue, TPossibleType& possibleType) {
    try {
        TCsvToYdbConverter probe(parser, nullValue);
        const bool fits = probe.TryParseValue(token, possibleType);
        return fits;
    } catch (const std::exception& error) {
        Cerr << "UNEXPECTED EXCEPTION: " << error.what() << Endl;
        return false;
    }
}
458+
333459
TStringBuf Consume(NCsvFormat::CsvSplitter& splitter,
334460
const TCsvParser::TParseMetadata& meta,
335461
const TString& columnName) {
@@ -489,6 +615,156 @@ void TCsvParser::BuildLineType() {
489615
ResultLineType = builder.Build();
490616
ResultListType = TTypeBuilder().List(ResultLineType.value()).Build();
491617
}
618+
namespace {
619+
static const std::vector<TType> availableTypes = {
620+
TTypeBuilder().Primitive(EPrimitiveType::Bool).Build(),
621+
TTypeBuilder().Primitive(EPrimitiveType::Uint64).Build(),
622+
TTypeBuilder().Primitive(EPrimitiveType::Int64).Build(),
623+
TTypeBuilder().Primitive(EPrimitiveType::Double).Build(),
624+
TTypeBuilder().Primitive(EPrimitiveType::Date).Build(),
625+
TTypeBuilder().Primitive(EPrimitiveType::Datetime).Build(),
626+
TTypeBuilder().Primitive(EPrimitiveType::Timestamp).Build(),
627+
TTypeBuilder().Primitive(EPrimitiveType::Json).Build(),
628+
629+
TTypeBuilder().Primitive(EPrimitiveType::Interval).Build(),
630+
TTypeBuilder().Primitive(EPrimitiveType::Date32).Build(),
631+
TTypeBuilder().Primitive(EPrimitiveType::Datetime64).Build(),
632+
TTypeBuilder().Primitive(EPrimitiveType::Timestamp64).Build(),
633+
TTypeBuilder().Primitive(EPrimitiveType::Interval64).Build(),
634+
};
635+
636+
static const auto availableTypesEnd = availableTypes.end();
637+
638+
} // namespace
639+
640+
TPossibleType::TPossibleType() {
641+
CurrentType = availableTypes.begin();
642+
}
643+
644+
TPossibleType::TPossibleType(std::vector<TType>::const_iterator currentType)
645+
: CurrentType(currentType)
646+
{
647+
}
648+
649+
void TPossibleType::SetIterator(const std::vector<TType>::const_iterator& newIterator) {
650+
CurrentType = newIterator;
651+
}
652+
653+
std::vector<TType>::const_iterator& TPossibleType::GetIterator() {
654+
return CurrentType;
655+
}
656+
657+
const std::vector<TType>::const_iterator& TPossibleType::GetAvailableTypesEnd() {
658+
return availableTypesEnd;
659+
}
660+
661+
void TPossibleType::SetHasNulls(bool hasNulls) {
662+
HasNulls = hasNulls;
663+
}
664+
665+
bool TPossibleType::GetHasNulls() const {
666+
return HasNulls;
667+
}
668+
669+
void TPossibleType::SetHasNonNulls(bool hasNonNulls) {
670+
HasNonNulls = hasNonNulls;
671+
}
672+
673+
bool TPossibleType::GetHasNonNulls() const {
674+
return HasNonNulls;
675+
}
676+
677+
// Creates `size` default-constructed entries, one per column.
TPossibleTypes::TPossibleTypes(size_t size)
    : ColumnPossibleTypes(size)
{
}

// Creates an instance holding a copy of the given per-column state.
TPossibleTypes::TPossibleTypes(std::vector<TPossibleType>& currentColumnTypes)
    : ColumnPossibleTypes(currentColumnTypes)
{
}
685+
686+
// Pass this copy to a worker to parse his chunk of data with it to merge it later back into this main chunk
687+
TPossibleTypes TPossibleTypes::GetCopy() {
688+
std::shared_lock<std::shared_mutex> ReadLock(Lock);
689+
return TPossibleTypes(ColumnPossibleTypes);
690+
}
691+
692+
// Merge this main chunk with another chunk that parsed a CSV batch and maybe dismissed some types
693+
void TPossibleTypes::MergeWith(TPossibleTypes& newTypes) {
694+
auto newTypesVec = newTypes.GetColumnPossibleTypes();
695+
{
696+
std::shared_lock<std::shared_mutex> ReadLock(Lock);
697+
bool changed = false;
698+
for (size_t i = 0; i < ColumnPossibleTypes.size(); ++i) {
699+
auto& currentPossibleType = ColumnPossibleTypes[i];
700+
auto& newPossibleType = newTypesVec[i];
701+
auto& currentIt = currentPossibleType.GetIterator();
702+
const auto& newIt = newPossibleType.GetIterator();
703+
if (newIt > currentIt) {
704+
changed = true;
705+
break;
706+
}
707+
if (currentPossibleType.GetHasNulls() != newPossibleType.GetHasNulls()
708+
|| currentPossibleType.GetHasNonNulls() != newPossibleType.GetHasNonNulls()) {
709+
changed = true;
710+
break;
711+
}
712+
}
713+
if (!changed) {
714+
return;
715+
}
716+
}
717+
std::unique_lock<std::shared_mutex> WriteLock(Lock);
718+
for (size_t i = 0; i < ColumnPossibleTypes.size(); ++i) {
719+
auto& currentPossibleType = ColumnPossibleTypes[i];
720+
auto& newPossibleType = newTypesVec[i];
721+
const auto& newIt = newPossibleType.GetIterator();
722+
if (newIt > currentPossibleType.GetIterator()) {
723+
currentPossibleType.SetIterator(newIt);
724+
}
725+
if (newPossibleType.GetHasNulls()) {
726+
currentPossibleType.SetHasNulls(true);
727+
}
728+
if (newPossibleType.GetHasNonNulls()) {
729+
currentPossibleType.SetHasNonNulls(true);
730+
}
731+
}
732+
}
733+
734+
std::vector<TPossibleType>& TPossibleTypes::GetColumnPossibleTypes() {
735+
return ColumnPossibleTypes;
736+
}
737+
738+
// Narrows down the possible type of every column using one CSV line.
// The line is split with the configured delimiter. For each field the column's candidate
// iterator is advanced in place until a candidate type accepts the token or the candidate
// list is exhausted.
// Throws a FormatError when the number of fields in the line differs from the header.
void TCsvParser::ParseLineTypes(TString& line, TPossibleTypes& possibleTypes, const TParseMetadata& meta) {
    NCsvFormat::CsvSplitter splitter(line, Delimeter);
    auto columnName = Header.cbegin();
    auto columnState = possibleTypes.GetColumnPossibleTypes().begin();

    bool hasMoreFields = true;
    while (hasMoreFields) {
        if (columnName == Header.cend()) {
            throw FormatError(yexception() << "Header contains less fields than data. Header: \"" << HeaderRow << "\", data: \"" << line << "\"", meta);
        }
        TStringBuf token = Consume(splitter, meta, *columnName);
        TPossibleType& candidate = *columnState;
        // The iterator is taken by reference: advancing it updates the stored column state.
        for (auto& typeIt = candidate.GetIterator(); typeIt != availableTypesEnd; ++typeIt) {
            TTypeParser typeParser(*typeIt);
            if (TryParse(typeParser, token, NullValue, candidate)) {
                break;
            }
        }
        ++columnName;
        ++columnState;
        hasMoreFields = splitter.Step();
    }

    if (columnName != Header.cend()) {
        throw FormatError(yexception() << "Header contains more fields than data. Header: \"" << HeaderRow << "\", data: \"" << line << "\"", meta);
    }
}
764+
765+
// Read-only access to the stored header fields.
const TVector<TString>& TCsvParser::GetHeader()
{
    return Header;
}
492768

493769
}
494770
}

ydb/public/lib/ydb_cli/common/csv_parser.h

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,11 +4,44 @@
44

55
#include <library/cpp/string_utils/csv/csv.h>
66

7+
#include <shared_mutex>
8+
79
namespace NYdb {
810
namespace NConsoleClient {
911

1012
class TCsvParseException : public yexception {};
1113

14+
class TPossibleType {
15+
public:
16+
TPossibleType();
17+
TPossibleType(std::vector<TType>::const_iterator currentType);
18+
void SetIterator(const std::vector<TType>::const_iterator& newIterator);
19+
std::vector<TType>::const_iterator& GetIterator();
20+
static const std::vector<TType>::const_iterator& GetAvailableTypesEnd();
21+
void SetHasNulls(bool hasNulls);
22+
bool GetHasNulls() const;
23+
void SetHasNonNulls(bool hasNonNulls);
24+
bool GetHasNonNulls() const;
25+
private:
26+
std::vector<TType>::const_iterator CurrentType;
27+
bool HasNulls = false;
28+
bool HasNonNulls = false;
29+
};
30+
31+
class TPossibleTypes {
32+
public:
33+
TPossibleTypes(size_t size);
34+
TPossibleTypes(std::vector<TPossibleType>& currentColumnTypes);
35+
// Pass this copy to a worker to parse his chunk of data with it to merge it later back into this main chunk
36+
TPossibleTypes GetCopy();
37+
// Merge this main chunk with another chunk that parsed a CSV batch and maybe dismissed some types
38+
void MergeWith(TPossibleTypes& newTypes);
39+
std::vector<TPossibleType>& GetColumnPossibleTypes();
40+
private:
41+
std::vector<TPossibleType> ColumnPossibleTypes;
42+
std::shared_mutex Lock;
43+
};
44+
1245
class TCsvParser {
1346
public:
1447
struct TParseMetadata {
@@ -36,6 +69,8 @@ class TCsvParser {
3669
TValue BuildList(std::vector<TString>& lines, const TString& filename,
3770
std::optional<ui64> row = std::nullopt) const;
3871
void BuildLineType();
72+
const TVector<TString>& GetHeader();
73+
void ParseLineTypes(TString& line, TPossibleTypes& possibleTypes, const TParseMetadata& meta);
3974

4075
private:
4176
TVector<TString> Header;

0 commit comments

Comments
 (0)