Lightning: add character set conversion configuration instructions (pingcap#6714)
Frank945946 · Nov 5, 2021 · 8aab7c8 · 8aab7c8
1 parent c30cb6c
commit 8aab7c8
Showing 1 changed file with 16 additions and 2 deletions.
diff --git a/tidb-lightning/tidb-lightning-configuration.md b/tidb-lightning/tidb-lightning-configuration.md
@@ -157,10 +157,24 @@ no-schema = false
 #  - auto:    (default) automatically detects whether the schema is UTF-8 or
 #             GB-18030. An error is reported if the encoding is neither.
 #  - binary:  do not try to decode the schema files
-# Note that the *data* files are always parsed as binary regardless of
-# schema encoding.
 character-set = "auto"
 
+# Specifies the character set of the source data file. Lightning converts the source file from the specified character set to UTF-8 encoding when importing.
+# Currently, this configuration applies only to CSV files. The following options are supported:
+# - utf8mb4: Indicates that the source data file uses UTF-8 encoding.
+# - GB18030: Indicates that the source data file uses the GB-18030 encoding.
+# - GBK: Indicates that the source data file uses the GBK encoding (GBK is an extension of the GB-2312 character set, also known as Code Page 936).
+# - binary: Indicates that Lightning does not convert the encoding (by default).
+# If left blank, the default value "binary" is used, that is to say, Lightning does not convert the encoding.
+# Note that Lightning does not try to guess the character set of the source data file; it only converts the source file and imports the data based on this configuration.
+# If the value of this configuration does not match the actual encoding of the source data file, a failed import, data loss, or data disorder might occur.
+data-character-set = "binary"
+# Specifies the replacement character in case of incompatible characters during the character set conversion of the source data file.
+# This value must not be the same as the field separator, the quoting delimiter, or the line terminator.
+# The default value is "\uFFFD", which is the "error" Rune or Unicode replacement character in UTF-8 encoding.
+# Changing the default value might degrade the parsing performance for the source data file.
+data-invalid-char-replace = "\uFFFD"
+
 # the input data in a "strict" format speeds up processing.
 # "strict-format = true" requires that:
 # in CSV, every value cannot contain literal new lines (U+000A and U+000D, or \r and \n) even