Skip to content

Commit 0295d0d

Browse files
committed
This patch enables the -fexec-charset option to control the execution charset of string literals. It sets the defaults for the internal charset, system charset, and execution charset on z/OS, and uses UTF-8 on all other platforms.
1 parent 3192c7b commit 0295d0d

File tree

20 files changed

+377
-53
lines changed

20 files changed

+377
-53
lines changed

clang/docs/LanguageExtensions.rst

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -416,8 +416,7 @@ Builtin Macros
416416
``__clang_literal_encoding__``
417417
Defined to a narrow string literal that represents the current encoding of
418418
narrow string literals, e.g., ``"hello"``. This macro typically expands to
419-
"UTF-8" (but may change in the future if the
420-
``-fexec-charset="Encoding-Name"`` option is implemented.)
419+
the charset given by ``-fexec-charset`` if that option is specified, or to the system charset otherwise.
421420

422421
``__clang_wide_literal_encoding__``
423422
Defined to a narrow string literal that represents the current encoding of

clang/include/clang/Basic/LangOptions.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -633,6 +633,9 @@ class LangOptions : public LangOptionsBase {
633633
bool AtomicFineGrainedMemory = false;
634634
bool AtomicIgnoreDenormalMode = false;
635635

636+
/// Name of the execution charset that string and character literals are converted to from the internal charset.
637+
std::string ExecCharset;
638+
636639
LangOptions();
637640

638641
/// Set language defaults for the given input language and

clang/include/clang/Basic/TokenKinds.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -101,6 +101,13 @@ inline bool isLiteral(TokenKind K) {
101101
isStringLiteral(K) || K == tok::header_name || K == tok::binary_data;
102102
}
103103

104+
/// Return true if this is a utf literal kind.
105+
inline bool isUTFLiteral(TokenKind K) {
106+
return K == tok::utf8_char_constant || K == tok::utf8_string_literal ||
107+
K == tok::utf16_char_constant || K == tok::utf16_string_literal ||
108+
K == tok::utf32_char_constant || K == tok::utf32_string_literal;
109+
}
110+
104111
/// Return true if this is any of tok::annot_* kinds.
105112
bool isAnnotation(TokenKind K);
106113

clang/include/clang/Driver/Options.td

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7197,6 +7197,11 @@ let Visibility = [CC1Option, CC1AsOption, FC1Option] in {
71977197
def tune_cpu : Separate<["-"], "tune-cpu">,
71987198
HelpText<"Tune for a specific cpu type">,
71997199
MarshallingInfoString<TargetOpts<"TuneCPU">>;
7200+
// Execution charset option: takes the charset name as a separate argument
// and marshals the value into the ExecCharset field of LangOpts.
def fexec_charset : Separate<["-"], "fexec-charset">, MetaVarName<"<charset>">,
7201+
HelpText<"Set the execution <charset> for string and character literals. "
7202+
"Supported character encodings include ISO8859-1, UTF-8, IBM-1047 "
7203+
"and those supported by the host icu or iconv library.">,
7204+
MarshallingInfoString<LangOpts<"ExecCharset">>;
72007205
def target_cpu : Separate<["-"], "target-cpu">,
72017206
HelpText<"Target a specific cpu type">,
72027207
MarshallingInfoString<TargetOpts<"CPU">>;
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
//===--- clang/Lex/LiteralConverter.h - Translator for Literals -*- C++ -*-===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
#ifndef LLVM_CLANG_LEX_LITERALCONVERTER_H
10+
#define LLVM_CLANG_LEX_LITERALCONVERTER_H
11+
12+
#include "clang/Basic/Diagnostic.h"
13+
#include "clang/Basic/LangOptions.h"
14+
#include "clang/Basic/TargetInfo.h"
15+
#include "llvm/ADT/StringMap.h"
16+
#include "llvm/ADT/StringRef.h"
17+
#include "llvm/Support/CharSet.h"
18+
19+
/// Selects the target charset for a literal conversion. In
/// LiteralConverter::getConverter, ToSystemCharset selects the system charset,
/// ToExecCharset selects the execution charset, and NoConversion selects the
/// internal charset.
enum ConversionAction { NoConversion, ToSystemCharset, ToExecCharset };
20+
21+
/// Holds the charset names used for literal conversion and a cache of the
/// converters that translate from the internal charset to other charsets.
class LiteralConverter {
22+
/// Charset that every converter translates from; set to "UTF-8" by
/// setConvertersFromOptions.
llvm::StringRef InternalCharset;
23+
/// Charset reported by the target triple (Triple::getSystemCharset()).
llvm::StringRef SystemCharset;
24+
/// Charset named by LangOptions::ExecCharset, or the internal charset when
/// that option is empty.
llvm::StringRef ExecCharset;
25+
/// Converters created so far, keyed by the name of the charset they
/// convert to.
llvm::StringMap<llvm::CharSetConverter> CharsetConverters;
26+
27+
public:
28+
/// Return the cached converter whose target charset is \p Codepage, or
/// nullptr if none has been created.
llvm::CharSetConverter *getConverter(const char *Codepage);
29+
/// Return the cached converter for the charset selected by \p Action, or
/// nullptr if none has been created.
llvm::CharSetConverter *getConverter(ConversionAction Action);
30+
/// Return the converter from the internal charset to \p To, creating and
/// caching it first if necessary; nullptr if it cannot be created.
llvm::CharSetConverter *createAndInsertCharConverter(const char *To);
31+
/// Set the internal, system and execution charset names from \p Opts and
/// \p TInfo and create the corresponding converters; reports
/// err_drv_invalid_value through \p Diags if the execution charset converter
/// cannot be created.
void setConvertersFromOptions(const clang::LangOptions &Opts,
32+
const clang::TargetInfo &TInfo,
33+
clang::DiagnosticsEngine &Diags);
34+
};
35+
36+
#endif

clang/include/clang/Lex/LiteralSupport.h

Lines changed: 11 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -17,10 +17,12 @@
1717
#include "clang/Basic/CharInfo.h"
1818
#include "clang/Basic/LLVM.h"
1919
#include "clang/Basic/TokenKinds.h"
20+
#include "clang/Lex/LiteralConverter.h"
2021
#include "llvm/ADT/APFloat.h"
2122
#include "llvm/ADT/ArrayRef.h"
2223
#include "llvm/ADT/SmallString.h"
2324
#include "llvm/ADT/StringRef.h"
25+
#include "llvm/Support/CharSet.h"
2426
#include "llvm/Support/DataTypes.h"
2527

2628
namespace clang {
@@ -233,6 +235,7 @@ class StringLiteralParser {
233235
const LangOptions &Features;
234236
const TargetInfo &Target;
235237
DiagnosticsEngine *Diags;
238+
LiteralConverter *LiteralConv;
236239

237240
unsigned MaxTokenLength;
238241
unsigned SizeBound;
@@ -246,18 +249,19 @@ class StringLiteralParser {
246249
StringLiteralEvalMethod EvalMethod;
247250

248251
public:
249-
StringLiteralParser(ArrayRef<Token> StringToks, Preprocessor &PP,
250-
StringLiteralEvalMethod StringMethod =
251-
StringLiteralEvalMethod::Evaluated);
252+
StringLiteralParser(
253+
ArrayRef<Token> StringToks, Preprocessor &PP,
254+
StringLiteralEvalMethod StringMethod = StringLiteralEvalMethod::Evaluated,
255+
ConversionAction Action = ToExecCharset);
252256
StringLiteralParser(ArrayRef<Token> StringToks, const SourceManager &sm,
253257
const LangOptions &features, const TargetInfo &target,
254258
DiagnosticsEngine *diags = nullptr)
255259
: SM(sm), Features(features), Target(target), Diags(diags),
256-
MaxTokenLength(0), SizeBound(0), CharByteWidth(0), Kind(tok::unknown),
257-
ResultPtr(ResultBuf.data()),
260+
LiteralConv(nullptr), MaxTokenLength(0), SizeBound(0), CharByteWidth(0),
261+
Kind(tok::unknown), ResultPtr(ResultBuf.data()),
258262
EvalMethod(StringLiteralEvalMethod::Evaluated), hadError(false),
259263
Pascal(false) {
260-
init(StringToks);
264+
init(StringToks, NoConversion);
261265
}
262266

263267
bool hadError;
@@ -305,7 +309,7 @@ class StringLiteralParser {
305309
static bool isValidUDSuffix(const LangOptions &LangOpts, StringRef Suffix);
306310

307311
private:
308-
void init(ArrayRef<Token> StringToks);
312+
void init(ArrayRef<Token> StringToks, ConversionAction Action);
309313
bool CopyStringFragment(const Token &Tok, const char *TokBegin,
310314
StringRef Fragment);
311315
void DiagnoseLexingError(SourceLocation Loc);

clang/include/clang/Lex/Preprocessor.h

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@
2525
#include "clang/Basic/TokenKinds.h"
2626
#include "clang/Lex/HeaderSearch.h"
2727
#include "clang/Lex/Lexer.h"
28+
#include "clang/Lex/LiteralConverter.h"
2829
#include "clang/Lex/MacroInfo.h"
2930
#include "clang/Lex/ModuleLoader.h"
3031
#include "clang/Lex/ModuleMap.h"
@@ -156,6 +157,7 @@ class Preprocessor {
156157
std::unique_ptr<ScratchBuffer> ScratchBuf;
157158
HeaderSearch &HeaderInfo;
158159
ModuleLoader &TheModuleLoader;
160+
LiteralConverter LiteralConv;
159161

160162
/// External source of macros.
161163
ExternalPreprocessorSource *ExternalSource;
@@ -1218,6 +1220,7 @@ class Preprocessor {
12181220
SelectorTable &getSelectorTable() { return Selectors; }
12191221
Builtin::Context &getBuiltinInfo() { return *BuiltinInfo; }
12201222
llvm::BumpPtrAllocator &getPreprocessorAllocator() { return BP; }
1223+
LiteralConverter &getLiteralConverter() { return LiteralConv; }
12211224

12221225
void setExternalSource(ExternalPreprocessorSource *Source) {
12231226
ExternalSource = Source;

clang/lib/Driver/ToolChains/Clang.cpp

Lines changed: 13 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -50,6 +50,7 @@
5050
#include "llvm/Frontend/Debug/Options.h"
5151
#include "llvm/Object/ObjectFile.h"
5252
#include "llvm/Option/ArgList.h"
53+
#include "llvm/Support/CharSet.h"
5354
#include "llvm/Support/CodeGen.h"
5455
#include "llvm/Support/Compiler.h"
5556
#include "llvm/Support/Compression.h"
@@ -7597,12 +7598,20 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA,
75977598
<< value;
75987599
}
75997600

7600-
// -fexec_charset=UTF-8 is default. Reject others
7601+
// Set the default fexec-charset as the system charset.
7602+
CmdArgs.push_back("-fexec-charset");
7603+
CmdArgs.push_back(Args.MakeArgString(Triple.getSystemCharset()));
76017604
if (Arg *execCharset = Args.getLastArg(options::OPT_fexec_charset_EQ)) {
76027605
StringRef value = execCharset->getValue();
7603-
if (!value.equals_insensitive("utf-8"))
7604-
D.Diag(diag::err_drv_invalid_value) << execCharset->getAsString(Args)
7605-
<< value;
7606+
llvm::ErrorOr<llvm::CharSetConverter> ErrorOrConverter =
7607+
llvm::CharSetConverter::create("UTF-8", value.data());
7608+
if (ErrorOrConverter) {
7609+
CmdArgs.push_back("-fexec-charset");
7610+
CmdArgs.push_back(Args.MakeArgString(value));
7611+
} else {
7612+
D.Diag(diag::err_drv_invalid_value)
7613+
<< execCharset->getAsString(Args) << value;
7614+
}
76067615
}
76077616

76087617
RenderDiagnosticsOptions(D, Args, CmdArgs);

clang/lib/Frontend/CompilerInstance.cpp

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,6 +32,7 @@
3232
#include "clang/Frontend/Utils.h"
3333
#include "clang/Frontend/VerifyDiagnosticConsumer.h"
3434
#include "clang/Lex/HeaderSearch.h"
35+
#include "clang/Lex/LiteralConverter.h"
3536
#include "clang/Lex/Preprocessor.h"
3637
#include "clang/Lex/PreprocessorOptions.h"
3738
#include "clang/Sema/CodeCompleteConsumer.h"
@@ -537,6 +538,9 @@ void CompilerInstance::createPreprocessor(TranslationUnitKind TUKind) {
537538

538539
if (GetDependencyDirectives)
539540
PP->setDependencyDirectivesGetter(*GetDependencyDirectives);
541+
542+
PP->getLiteralConverter().setConvertersFromOptions(getLangOpts(), getTarget(),
543+
getDiagnostics());
540544
}
541545

542546
std::string CompilerInstance::getSpecificModuleCachePath(StringRef ModuleHash) {

clang/lib/Frontend/InitPreprocessor.cpp

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1058,10 +1058,14 @@ static void InitializePredefinedMacros(const TargetInfo &TI,
10581058
}
10591059
}
10601060

1061-
// Macros to help identify the narrow and wide character sets
1062-
// FIXME: clang currently ignores -fexec-charset=. If this changes,
1063-
// then this may need to be updated.
1064-
Builder.defineMacro("__clang_literal_encoding__", "\"UTF-8\"");
1061+
// Macros to help identify the narrow and wide character sets. This is set
1062+
// to fexec-charset. If fexec-charset is not specified, the default is the
1063+
// system charset.
1064+
if (!LangOpts.ExecCharset.empty())
1065+
Builder.defineMacro("__clang_literal_encoding__", LangOpts.ExecCharset);
1066+
else
1067+
Builder.defineMacro("__clang_literal_encoding__",
1068+
TI.getTriple().getSystemCharset());
10651069
if (TI.getTypeWidth(TI.getWCharType()) >= 32) {
10661070
// FIXME: 32-bit wchar_t signals UTF-32. This may change
10671071
// if -fwide-exec-charset= is ever supported.

clang/lib/Lex/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ add_clang_library(clangLex
1212
InitHeaderSearch.cpp
1313
Lexer.cpp
1414
LexHLSLRootSignature.cpp
15+
LiteralConverter.cpp
1516
LiteralSupport.cpp
1617
MacroArgs.cpp
1718
MacroInfo.cpp

clang/lib/Lex/LiteralConverter.cpp

Lines changed: 68 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,68 @@
1+
//===--- LiteralConverter.cpp - Translator for String Literals -----------===//
2+
//
3+
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4+
// See https://llvm.org/LICENSE.txt for license information.
5+
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6+
//
7+
//===----------------------------------------------------------------------===//
8+
9+
#include "clang/Lex/LiteralConverter.h"
10+
#include "clang/Basic/DiagnosticDriver.h"
11+
12+
using namespace llvm;
13+
14+
/// Look up the cached converter whose target charset is \p Codepage.
/// \returns a pointer to the cached converter, or nullptr when no converter
/// for that charset has been created.
llvm::CharSetConverter *LiteralConverter::getConverter(const char *Codepage) {
  auto Found = CharsetConverters.find(Codepage);
  return Found == CharsetConverters.end() ? nullptr : &Found->second;
}
20+
21+
/// Look up the cached converter for the charset selected by \p Action.
///
/// ToSystemCharset selects the system charset and ToExecCharset the execution
/// charset; any other value (including NoConversion) selects the internal
/// charset.
///
/// \returns a pointer to the cached converter, or nullptr when no converter
/// for the selected charset has been created.
llvm::CharSetConverter *
LiteralConverter::getConverter(ConversionAction Action) {
  StringRef CodePage = InternalCharset;
  switch (Action) {
  case ToSystemCharset:
    CodePage = SystemCharset;
    break;
  case ToExecCharset:
    CodePage = ExecCharset;
    break;
  case NoConversion:
    break;
  }

  // Look the name up as a StringRef rather than round-tripping through
  // data(): a StringRef carries its own length and is not guaranteed to be
  // NUL-terminated.
  auto Iter = CharsetConverters.find(CodePage);
  return Iter != CharsetConverters.end() ? &Iter->second : nullptr;
}
32+
33+
/// Return the converter from the internal charset to \p To, building and
/// caching it on first use.
/// \returns the cached or newly created converter, or nullptr when
/// CharSetConverter::create reports an error.
llvm::CharSetConverter *
LiteralConverter::createAndInsertCharConverter(const char *To) {
  // Reuse a converter that was already created for this target charset.
  if (llvm::CharSetConverter *Cached = getConverter(To))
    return Cached;

  ErrorOr<CharSetConverter> NewConverter =
      llvm::CharSetConverter::create(InternalCharset.data(), To);
  if (!NewConverter)
    return nullptr;

  // The map owns the converter; hand back a pointer to the stored entry.
  auto Inserted = CharsetConverters.insert_or_assign(StringRef(To),
                                                     std::move(*NewConverter));
  return &Inserted.first->second;
}
48+
49+
void LiteralConverter::setConvertersFromOptions(
50+
const clang::LangOptions &Opts, const clang::TargetInfo &TInfo,
51+
clang::DiagnosticsEngine &Diags) {
52+
using namespace llvm;
53+
SystemCharset = TInfo.getTriple().getSystemCharset();
54+
InternalCharset = "UTF-8";
55+
ExecCharset = Opts.ExecCharset.empty() ? InternalCharset : Opts.ExecCharset;
56+
// Create converter between internal and system charset
57+
if (!InternalCharset.equals(SystemCharset))
58+
createAndInsertCharConverter(SystemCharset.data());
59+
60+
// Create converter between internal and exec charset specified
61+
// in fexec-charset option.
62+
if (InternalCharset.equals(ExecCharset))
63+
return;
64+
if (!createAndInsertCharConverter(ExecCharset.data())) {
65+
Diags.Report(clang::diag::err_drv_invalid_value)
66+
<< "-fexec-charset" << ExecCharset;
67+
}
68+
}

0 commit comments

Comments
 (0)