Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
/*******************************************************************************
/*

* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
Expand All @@ -15,7 +15,7 @@
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
******************************************************************************/
*/
package org.apache.drill.exec.expr.fn.impl;

import io.netty.buffer.DrillBuf;
Expand Down Expand Up @@ -144,41 +144,28 @@ public static int varTypesToInt(final int start, final int end, DrillBuf buffer)
return result;
}

// Assumes Alpha as [A-Za-z0-9]
// white space is treated as everything else.
/**
* Capitalizes first letter in each word.
* Any symbol except digits and letters is considered as word delimiter.
*
* @param start start position in input buffer
* @param end end position in input buffer
* @param inBuf buffer with input characters
* @param outBuf buffer with output characters
*/
public static void initCap(int start, int end, DrillBuf inBuf, DrillBuf outBuf) {
boolean capNext = true;
boolean capitalizeNext = true;
int out = 0;
for (int id = start; id < end; id++, out++) {
byte currentByte = inBuf.getByte(id);

// 'A - Z' : 0x41 - 0x5A
// 'a - z' : 0x61 - 0x7A
// '0-9' : 0x30 - 0x39
if (capNext) { // curCh is whitespace or first character of word.
if (currentByte >= 0x30 && currentByte <= 0x39) { // 0-9
capNext = false;
} else if (currentByte >= 0x41 && currentByte <= 0x5A) { // A-Z
capNext = false;
} else if (currentByte >= 0x61 && currentByte <= 0x7A) { // a-z
capNext = false;
currentByte -= 0x20; // Uppercase this character
}
// else {} whitespace
} else { // Inside of a word or white space after end of word.
if (currentByte >= 0x30 && currentByte <= 0x39) { // 0-9
// noop
} else if (currentByte >= 0x41 && currentByte <= 0x5A) { // A-Z
currentByte -= 0x20; // Lowercase this character
} else if (currentByte >= 0x61 && currentByte <= 0x7A) { // a-z
// noop
} else { // whitespace
capNext = true;
}
int currentByte = inBuf.getByte(id);
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This code works only for ASCII, but not for UTF-8. UTF-8 is a multi-byte code that requires special encoding/decoding to convert to Unicode characters. Without that encoding, this method won't work for Cyrillic, Greek or any other character set with upper/lower distinctions.

Since this method never worked, it is probably OK to make it a bit less broken than before: at least now it works for ASCII. Please add unit tests below, then file a JIRA, for the fact that this function does not work with UTF-8 despite the fact that Drill claims it supports UTF-8.

if (Character.isLetterOrDigit(currentByte)) {
currentByte = capitalizeNext ? Character.toUpperCase(currentByte) : Character.toLowerCase(currentByte);
capitalizeNext = false;
} else {
capitalizeNext = true;
}

outBuf.setByte(out, currentByte);
} // end of for_loop
}
}

/**
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -501,13 +501,7 @@ public void eval() {

for (int id = input.start; id < input.end; id++) {
byte currentByte = input.buffer.getByte(id);

// 'A - Z' : 0x41 - 0x5A
// 'a - z' : 0x61 - 0x7A
if (currentByte >= 0x41 && currentByte <= 0x5A) {
currentByte += 0x20;
}
out.buffer.setByte(id - input.start, currentByte) ;
out.buffer.setByte(id - input.start, Character.toLowerCase(currentByte)) ;
}
}
}
Expand All @@ -534,13 +528,7 @@ public void eval() {

for (int id = input.start; id < input.end; id++) {
byte currentByte = input.buffer.getByte(id);

// 'A - Z' : 0x41 - 0x5A
// 'a - z' : 0x61 - 0x7A
if (currentByte >= 0x61 && currentByte <= 0x7A) {
currentByte -= 0x20;
}
out.buffer.setByte(id - input.start, currentByte) ;
out.buffer.setByte(id - input.start, Character.toUpperCase(currentByte)) ;
}
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,17 +19,26 @@

import static org.junit.Assert.assertTrue;

import mockit.Mock;
import mockit.MockUp;
import mockit.integration.junit4.JMockit;
import org.apache.calcite.util.ConversionUtil;
import org.apache.calcite.util.Util;
import org.apache.commons.io.FileUtils;
import org.apache.drill.BaseTestQuery;
import org.apache.drill.exec.util.Text;
import org.junit.Ignore;
import org.junit.Test;

import com.google.common.collect.ImmutableList;
import org.junit.runner.RunWith;

import java.io.BufferedWriter;
import java.io.File;
import java.io.FileWriter;
import java.nio.charset.Charset;

@RunWith(JMockit.class)
public class TestStringFunctions extends BaseTestQuery {

@Test
Expand All @@ -38,14 +47,14 @@ public void testStrPosMultiByte() throws Exception {
.sqlQuery("select `position`('a', 'abc') res1 from (values(1))")
.ordered()
.baselineColumns("res1")
.baselineValues(1l)
.baselineValues(1L)
.go();

testBuilder()
.sqlQuery("select `position`('\\u11E9', '\\u11E9\\u0031') res1 from (values(1))")
.ordered()
.baselineColumns("res1")
.baselineValues(1l)
.baselineValues(1L)
.go();
}

Expand Down Expand Up @@ -308,4 +317,84 @@ public void testReverseLongVarChars() throws Exception {
FileUtils.deleteQuietly(path);
}
}

@Test
public void testLower() throws Exception {
testBuilder()
.sqlQuery("select\n" +
"lower('ABC') col_upper,\n" +
"lower('abc') col_lower,\n" +
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Please add tests for Greek and Cyrillic. Our source encoding is UTF-8, so you can enter the characters directly. Or, if that does not work, you can instead use the Java Unicode encoding: U1234.

If the tests fail because of parsing of SQL, please file a bug. If they fail because the function above does not support UTF-8, please file a different bug.

In either case, you can then comment out the test cases and add a comment that says that they fail due to DRILL-xxxx, whatever your bug number turns out to be.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Created Jira DRILL-5477 and added appropriate unit test which is ignored for now.

"lower('AbC aBc') col_space,\n" +
"lower('123ABC$!abc123.') as col_special,\n" +
"lower('') as col_empty,\n" +
"lower(cast(null as varchar(10))) as col_null\n" +
"from (values(1))")
.unOrdered()
.baselineColumns("col_upper", "col_lower", "col_space", "col_special", "col_empty", "col_null")
.baselineValues("abc", "abc", "abc abc", "123abc$!abc123.", "", null)
.build()
.run();
}

@Test
public void testUpper() throws Exception {
testBuilder()
.sqlQuery("select\n" +
"upper('ABC')as col_upper,\n" +
"upper('abc') as col_lower,\n" +
"upper('AbC aBc') as col_space,\n" +
"upper('123ABC$!abc123.') as col_special,\n" +
"upper('') as col_empty,\n" +
"upper(cast(null as varchar(10))) as col_null\n" +
"from (values(1))")
.unOrdered()
.baselineColumns("col_upper", "col_lower", "col_space", "col_special", "col_empty", "col_null")
.baselineValues("ABC", "ABC", "ABC ABC", "123ABC$!ABC123.", "", null)
.build()
.run();
}

@Test
public void testInitcap() throws Exception {
testBuilder()
.sqlQuery("select\n" +
"initcap('ABC')as col_upper,\n" +
"initcap('abc') as col_lower,\n" +
"initcap('AbC aBc') as col_space,\n" +
"initcap('123ABC$!abc123.') as col_special,\n" +
"initcap('') as col_empty,\n" +
"initcap(cast(null as varchar(10))) as col_null\n" +
"from (values(1))")
.unOrdered()
.baselineColumns("col_upper", "col_lower", "col_space", "col_special", "col_empty", "col_null")
.baselineValues("Abc", "Abc", "Abc Abc", "123abc$!Abc123.", "", null)
.build()
.run();
}

@Ignore("DRILL-5477")
@Test
public void testMultiByteEncoding() throws Exception {
// mock calcite util method to return utf charset
// instead of setting saffron.default.charset at system level
new MockUp<Util>()
{
@Mock
Charset getDefaultCharset() {
return Charset.forName(ConversionUtil.NATIVE_UTF16_CHARSET_NAME);
}
};

testBuilder()
.sqlQuery("select\n" +
"upper('привет')as col_upper,\n" +
"lower('ПРИВЕТ') as col_lower,\n" +
"initcap('приВЕТ') as col_initcap\n" +
"from (values(1))")
.unOrdered()
.baselineColumns("col_upper", "col_lower", "col_initcap")
.baselineValues("ПРИВЕТ", "привет", "Привет")
.build()
.run();
}
}