-
Notifications
You must be signed in to change notification settings - Fork 26
Add full support for HIGHCHARUNICODE
#397
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -20,6 +20,8 @@ | |
|
|
||
| import au.com.integradev.delphi.antlr.ast.visitors.DelphiParserVisitor; | ||
| import au.com.integradev.delphi.preprocessor.TextBlockLineEndingMode; | ||
| import java.nio.ByteBuffer; | ||
| import java.nio.charset.Charset; | ||
| import java.util.ArrayDeque; | ||
| import java.util.Deque; | ||
| import java.util.stream.Collectors; | ||
|
|
@@ -28,6 +30,7 @@ | |
| import org.apache.commons.lang3.Strings; | ||
| import org.sonar.plugins.communitydelphi.api.ast.DelphiNode; | ||
| import org.sonar.plugins.communitydelphi.api.ast.TextLiteralNode; | ||
| import org.sonar.plugins.communitydelphi.api.directive.SwitchDirective.SwitchKind; | ||
| import org.sonar.plugins.communitydelphi.api.token.DelphiTokenType; | ||
| import org.sonar.plugins.communitydelphi.api.type.IntrinsicType; | ||
| import org.sonar.plugins.communitydelphi.api.type.Type; | ||
|
|
@@ -167,26 +170,38 @@ private String createSingleLineValue() { | |
| return imageBuilder.toString(); | ||
| } | ||
|
|
||
| private static char characterEscapeToChar(String image) { | ||
| private boolean isHighCharUnicode() { | ||
| return getAst() | ||
| .getDelphiFile() | ||
| .getCompilerSwitchRegistry() | ||
| .isActiveSwitch(SwitchKind.HIGHCHARUNICODE, getTokenIndex()); | ||
| } | ||
|
|
||
| public Charset getAnsiCharset() { | ||
| return Charset.forName(System.getProperty("native.encoding")); | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Assuming that the compiler is actually using the configured codepage to interpret these escapes, we should:
|
||
| } | ||
|
|
||
| private char characterEscapeToChar(String image) { | ||
| image = image.substring(1); | ||
| int radix = 10; | ||
|
|
||
| switch (image.charAt(0)) { | ||
| case '$': | ||
| radix = 16; | ||
| image = image.substring(1); | ||
| break; | ||
| case '%': | ||
| radix = 2; | ||
| image = image.substring(1); | ||
| break; | ||
| default: | ||
| // do nothing | ||
| if (image.charAt(0) == '$') { | ||
| radix = 16; | ||
| image = image.substring(1); | ||
| } | ||
|
Comment on lines
+184
to
191
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Let's keep the handling for binary character escapes even though they're currently not syntactically valid. |
||
|
|
||
| image = StringUtils.remove(image, '_'); | ||
| char character = (char) Integer.parseInt(image, radix); | ||
|
|
||
| return (char) Integer.parseInt(image, radix); | ||
| if (isHighCharUnicode() || character > 255) { | ||
| // With HIGHCHARUNICODE ON, all escapes are interpreted as UTF-16. | ||
| // Escapes above 255 are always interpreted as UTF-16. | ||
| return character; | ||
| } else { | ||
| // With HIGHCHARUNICODE OFF, escapes between 0-255 are interpreted in the system code page. | ||
|
Comment on lines
+196
to
+201
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. According to the This wouldn't seem to matter, if you're only thinking about single-byte ANSI codepages that are supersets of ASCII.
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. With that being said, I've looked at the Shift_JIS character table and found that the first 127 characters are still ASCII. Even so, we should probably follow what the documentation says. |
||
| var buffer = ByteBuffer.allocate(1).put((byte) character).flip(); | ||
| return getAnsiCharset().decode(buffer).get(); | ||
cirras marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| } | ||
| } | ||
|
|
||
| @Override | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -20,17 +20,24 @@ | |
|
|
||
| import static org.assertj.core.api.Assertions.assertThat; | ||
| import static org.mockito.ArgumentMatchers.anyInt; | ||
| import static org.mockito.ArgumentMatchers.eq; | ||
| import static org.mockito.Mockito.mock; | ||
| import static org.mockito.Mockito.spy; | ||
| import static org.mockito.Mockito.when; | ||
|
|
||
| import au.com.integradev.delphi.antlr.DelphiLexer; | ||
| import au.com.integradev.delphi.antlr.ast.DelphiAstImpl; | ||
| import au.com.integradev.delphi.file.DelphiFile; | ||
| import au.com.integradev.delphi.preprocessor.CompilerSwitchRegistry; | ||
| import au.com.integradev.delphi.preprocessor.TextBlockLineEndingMode; | ||
| import au.com.integradev.delphi.preprocessor.TextBlockLineEndingModeRegistry; | ||
| import java.nio.charset.Charset; | ||
| import org.antlr.runtime.CommonToken; | ||
| import org.junit.jupiter.api.Test; | ||
| import org.junit.jupiter.params.ParameterizedTest; | ||
| import org.junit.jupiter.params.provider.ValueSource; | ||
| import org.sonar.plugins.communitydelphi.api.ast.DelphiNode; | ||
| import org.sonar.plugins.communitydelphi.api.directive.SwitchDirective.SwitchKind; | ||
|
|
||
| class TextLiteralNodeImplTest { | ||
| @Test | ||
|
|
@@ -59,22 +66,45 @@ void testMultilineImage() { | |
| assertThat(node.isMultiline()).isTrue(); | ||
| } | ||
|
|
||
| @Test | ||
| void testGetImageWithCharacterEscapes() { | ||
| TextLiteralNodeImpl node = new TextLiteralNodeImpl(DelphiLexer.TkTextLiteral); | ||
| @ParameterizedTest(name = "HIGHCHARUNICODE = {0}") | ||
| @ValueSource(booleans = {true, false}) | ||
| void testGetImageWithCharacterEscapes(boolean highCharUnicode) { | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The switch also affects the type of the expression - not just the way the string is interpreted. |
||
| var registry = mock(CompilerSwitchRegistry.class); | ||
| when(registry.isActiveSwitch(eq(SwitchKind.HIGHCHARUNICODE), anyInt())) | ||
| .thenReturn(highCharUnicode); | ||
| var file = mock(DelphiFile.class); | ||
| when(file.getCompilerSwitchRegistry()).thenReturn(registry); | ||
| var ast = mock(DelphiAstImpl.class); | ||
| when(ast.getDelphiFile()).thenReturn(file); | ||
|
|
||
| TextLiteralNodeImpl node = spy(new TextLiteralNodeImpl(DelphiLexer.TkTextLiteral)); | ||
| when(node.getAnsiCharset()).thenReturn(Charset.forName("windows-1252")); | ||
| node.setParent(ast); | ||
|
|
||
| node.addChild(createNode(DelphiLexer.TkQuotedString, "'F'")); | ||
| node.addChild(createNode(DelphiLexer.TkCharacterEscapeCode, "#111")); | ||
| node.addChild(createNode(DelphiLexer.TkCharacterEscapeCode, "#111")); | ||
| node.addChild(createNode(DelphiLexer.TkQuotedString, "'B'")); | ||
| node.addChild(createNode(DelphiLexer.TkCharacterEscapeCode, "#$61")); | ||
| node.addChild(createNode(DelphiLexer.TkCharacterEscapeCode, "#$72")); | ||
| node.addChild(createNode(DelphiLexer.TkQuotedString, "'B'")); | ||
| node.addChild(createNode(DelphiLexer.TkCharacterEscapeCode, "#%01100001")); | ||
| node.addChild(createNode(DelphiLexer.TkCharacterEscapeCode, "#%01111010")); | ||
| node.addChild(createNode(DelphiLexer.TkCharacterEscapeCode, "#$80")); | ||
| node.addChild(createNode(DelphiLexer.TkCharacterEscapeCode, "#$98")); | ||
| node.addChild(createNode(DelphiLexer.TkCharacterEscapeCode, "#$A3")); | ||
| node.addChild(createNode(DelphiLexer.TkCharacterEscapeCode, "#$20AC")); | ||
| node.addChild(createNode(DelphiLexer.TkQuotedString, "'az'")); | ||
|
|
||
| assertThat(node.getImage()).isEqualTo("'F'#111#111'B'#$61#$72'B'#%01100001#%01111010"); | ||
| assertThat(node.getValue()).isEqualTo(node.getImageWithoutQuotes()).isEqualTo("FooBarBaz"); | ||
| assertThat(node.isMultiline()).isFalse(); | ||
| assertThat(node.getImage()).isEqualTo("'F'#111#111'B'#$61#$72'B'#$80#$98#$A3#$20AC'az'"); | ||
| if (highCharUnicode) { | ||
| assertThat(node.getValue()) | ||
| .isEqualTo(node.getImageWithoutQuotes()) | ||
| .isEqualTo("FooBarB\u0080\u0098£€az"); | ||
| } else { | ||
| assertThat(node.getValue()) | ||
| .isEqualTo(node.getImageWithoutQuotes()) | ||
| .isEqualTo("FooBarB€˜£€az"); | ||
| } | ||
| } | ||
|
|
||
| @Test | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I think we should probably have a
CharsetUtilsor something in frontend.I'd also call these
nativeCharset.