Skip to content

Commit

Permalink
Merge pull request alibaba#3132 from gongxuanzhang/fix_utfdecode
Browse files Browse the repository at this point in the history
Fix utf decode
  • Loading branch information
zhuangjiaju authored Apr 28, 2023
2 parents 32aa7b4 + b835619 commit 770be56
Show file tree
Hide file tree
Showing 4 changed files with 135 additions and 3 deletions.
Original file line number Diff line number Diff line change
@@ -1,10 +1,12 @@
package com.alibaba.excel.analysis.v07.handlers.sax;

import com.alibaba.excel.cache.ReadCache;
import com.alibaba.excel.constant.ExcelXmlConstants;
import org.xml.sax.Attributes;
import org.xml.sax.helpers.DefaultHandler;

import com.alibaba.excel.cache.ReadCache;
import com.alibaba.excel.constant.ExcelXmlConstants;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
* Sax read sharedStringsTable.xml
Expand All @@ -13,6 +15,8 @@
*/
public class SharedStringsTableHandler extends DefaultHandler {

private static final Pattern UTF_PATTTERN = Pattern.compile("_x([0-9A-Fa-f]{4})_");

/**
* The final piece of data
*/
Expand Down Expand Up @@ -86,7 +90,7 @@ public void endElement(String uri, String localName, String name) {
if (currentData == null) {
readCache.put(null);
} else {
readCache.put(currentData.toString());
readCache.put(utfDecode(currentData.toString()));
}
break;
case ExcelXmlConstants.SHAREDSTRINGS_RPH_TAG:
Expand All @@ -109,4 +113,51 @@ public void characters(char[] ch, int start, int length) {
}
currentElementData.append(ch, start, length);
}

/**
 * Decodes OOXML {@code _xHHHH_} escape sequences in a shared string.
 * <p>
 * Characters that cannot be represented in an XML 1.0 document are stored using the
 * escape format {@code _xHHHH_}, where each {@code H} is a hexadecimal digit of the
 * character's value. Example: the Unicode character 0D is invalid in an XML 1.0
 * document, so it is stored as {@code _x000D_}. See section 3.18.9 in the OOXML spec.
 * <p>
 * Adapted from POI's {@code XSSFRichTextString}.
 *
 * @param value the string to decode, may be {@code null}
 * @return the decoded string; the input itself (possibly {@code null}) when it
 *         contains no well-formed escape sequence
 * @see org.apache.poi.xssf.usermodel.XSSFRichTextString#utfDecode(String)
 */
static String utfDecode(String value) {
    // Cheap pre-check: escapes are very rare, so most values never reach the regex.
    if (value == null || !value.contains("_x")) {
        return value;
    }

    Matcher matcher = UTF_PATTTERN.matcher(value);
    if (!matcher.find()) {
        // "_x" occurred, but not as part of a well-formed _xHHHH_ sequence.
        return value;
    }

    StringBuilder decoded = new StringBuilder(value.length());
    int copiedUpTo = 0;
    do {
        // Copy the literal text between the previous escape (or the start) and this one.
        decoded.append(value, copiedUpTo, matcher.start());
        // Group 1 holds the four hex digits of the escaped character.
        decoded.append((char) Integer.parseInt(matcher.group(1), 16));
        copiedUpTo = matcher.end();
    } while (matcher.find());

    // Copy whatever follows the last escape sequence.
    decoded.append(value, copiedUpTo, value.length());
    return decoded.toString();
}
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
package com.alibaba.easyexcel.test.demo.rare;

import com.alibaba.easyexcel.test.util.TestFileUtil;
import com.alibaba.excel.EasyExcel;
import org.apache.poi.xssf.usermodel.XSSFRow;
import org.apache.poi.xssf.usermodel.XSSFSheet;
import org.apache.poi.xssf.usermodel.XSSFWorkbook;
import org.junit.Assert;
import org.junit.Test;

import java.util.List;
import java.util.Map;

/**
 * Records some less common read cases.
 *
 * @author gxz gongxuanzhang@foxmail.com
 **/
public class ReadTest {

    /**
     * When an Excel file contains special symbols that need escaping (such as x005),
     * they have to be decoded via utf decode. The value read by EasyExcel is compared
     * against the value POI reads from the same cell.
     **/
    @Test
    public void readX005() throws Exception {
        String fileName = TestFileUtil.pathBuild().sub("temp").sub("utfdecode").sub("demo.xlsx").getPath();

        // Read the reference value with POI. The workbook is opened from a stream and
        // closed via try-with-resources so no file handle is leaked and closing it
        // cannot write anything back to the fixture file.
        String poiValue;
        try (java.io.InputStream in = new java.io.FileInputStream(fileName);
             XSSFWorkbook xssfWorkbook = new XSSFWorkbook(in)) {
            XSSFSheet xssfSheet = xssfWorkbook.getSheetAt(0);
            XSSFRow row = xssfSheet.getRow(0);
            poiValue = row.getCell(0).getStringCellValue();
        }

        List<Map<Integer, Object>> list = EasyExcel.read(fileName)
            .sheet(0)
            .headRowNumber(0).doReadSync();
        Map<Integer, Object> easyExcelRow = list.get(0);

        // JUnit's contract is assertEquals(expected, actual): POI is the reference.
        Assert.assertEquals(poiValue, easyExcelRow.get(0).toString());
    }
}
Original file line number Diff line number Diff line change
@@ -1,7 +1,11 @@
package com.alibaba.easyexcel.test.util;

import org.springframework.util.CollectionUtils;

import java.io.File;
import java.io.InputStream;
import java.util.ArrayList;
import java.util.List;

public class TestFileUtil {

Expand All @@ -14,6 +18,10 @@ public static String getPath() {
return TestFileUtil.class.getResource("/").getPath();
}

/**
 * Creates a new {@link TestPathBuild} with no path segments added yet.
 *
 * @return a fresh builder instance
 */
public static TestPathBuild pathBuild() {
return new TestPathBuild();
}

public static File createNewFile(String pathName) {
File file = new File(getPath() + pathName);
if (file.exists()) {
Expand All @@ -33,4 +41,39 @@ public static File readFile(String pathName) {
/**
 * Builds a {@link File} located below the current user's home directory.
 *
 * @param pathName path appended to the value of the {@code user.home} system property
 * @return a file for {@code user.home + File.separator + pathName}
 */
public static File readUserHomeFile(String pathName) {
    String userHome = System.getProperty("user.home");
    String fullPath = userHome + File.separator + pathName;
    return new File(fullPath);
}


/**
 * Fluent builder for test file paths. The built path starts with the value of
 * {@link TestFileUtil#getPath()} (the path of the classpath resource {@code "/"}),
 * followed by the added segments.
 **/
public static class TestPathBuild {

    /** Path segments, in the order they were added through {@link #sub(String)}. */
    private final List<String> subPath = new ArrayList<>();

    private TestPathBuild() {
    }

    /**
     * Appends one path segment.
     *
     * @param dirOrFile name of the directory or file to append
     * @return this builder, to allow chaining
     */
    public TestPathBuild sub(String dirOrFile) {
        subPath.add(dirOrFile);
        return this;
    }

    /**
     * Builds the path string.
     *
     * @return the root path directly followed by all segments joined with
     *         {@link File#separator}; only the root path when no segment was added
     */
    public String getPath() {
        // Joining an empty list yields "", so the empty and single-segment
        // cases need no special handling.
        return TestFileUtil.getPath() + String.join(File.separator, subPath);
    }

}


}
Binary file not shown.

0 comments on commit 770be56

Please sign in to comment.