Skip to content

Commit

Permalink
Lightning: add character set conversion configuration instructions (p…
Browse files Browse the repository at this point in the history
  • Loading branch information
Liuxiaozhen12 authored Nov 5, 2021
1 parent c30cb6c commit 8aab7c8
Showing 1 changed file with 16 additions and 2 deletions.
18 changes: 16 additions & 2 deletions tidb-lightning/tidb-lightning-configuration.md
Original file line number Diff line number Diff line change
Expand Up @@ -157,10 +157,24 @@ no-schema = false
# - auto: (default) automatically detects whether the schema is UTF-8 or
# GB-18030. An error is reported if the encoding is neither.
# - binary: do not try to decode the schema files
# Note that the *data* files are always parsed as binary regardless of
# schema encoding.
character-set = "auto"

# Specifies the character set of the source data file. Lightning converts the source file from the specified character set to UTF-8 encoding when importing.
# Currently, this configuration only specifies the character set of the CSV files with the following options supported:
# - utf8mb4: Indicates that the source data file uses UTF-8 encoding.
# - GB18030: Indicates that the source data file uses the GB-18030 encoding.
# - GBK: The source data file uses GBK encoding (GBK encoding is an extension of the GB-2312 character set, also known as Code Page 936).
# - binary: Indicates that Lightning does not convert the encoding (by default).
# If left blank, the default value "binary" is used, that is to say, Lightning does not convert the encoding.
# Note that Lightning does not predict about the character set of the source data file and only converts the source file and import the data based on this configuration.
# If the value of this configuration is not the same as the actual encoding of the source data file, a failed import, data loss or data disorder might appear.
data-character-set = "binary"
# Specifies the replacement character in case of incompatible characters during the character set conversion of the source data file.
# This configuration must not be duplicated with field separators, quote definers, and line breaks.
# The default value is "\uFFFD", which is the "error" Rune or Unicode replacement character in UTF-8 encoding.
# Changing the default value might result in potential degradation of parsing performance for the source data file.
data-invalid-char-replace = "\uFFFD"

# the input data in a "strict" format speeds up processing.
# "strict-format = true" requires that:
# in CSV, every value cannot contain literal new lines (U+000A and U+000D, or \r and \n) even
Expand Down

0 comments on commit 8aab7c8

Please sign in to comment.