From 00ba281f9d54bdb50e67cb07c136a86227f0e8ab Mon Sep 17 00:00:00 2001
From: Emmanuel Keller
Date: Tue, 1 Jul 2014 14:47:18 +0200
Subject: [PATCH] Fix #739. HTMLCleaner does not preserve the lang attribute on DOM serialization.

---
 .../searchlib/parser/htmlParser/HtmlCleanerParser.java | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/main/java/com/jaeksoft/searchlib/parser/htmlParser/HtmlCleanerParser.java b/src/main/java/com/jaeksoft/searchlib/parser/htmlParser/HtmlCleanerParser.java
index 2afacd09e..cecd47e19 100644
--- a/src/main/java/com/jaeksoft/searchlib/parser/htmlParser/HtmlCleanerParser.java
+++ b/src/main/java/com/jaeksoft/searchlib/parser/htmlParser/HtmlCleanerParser.java
@@ -80,6 +80,9 @@ protected HtmlNodeAbstract getDocument(String pageSource)
 	private DomHtmlNode getDomHtmlNode() throws ParserConfigurationException {
 		Document document = new DomSerializer(cleaner.getProperties(), true)
 				.createDOM(rootTagNode);
+		String lang = rootTagNode.getAttributeByName("lang");
+		if (lang != null)
+			document.getDocumentElement().setAttribute("lang", lang);
 		return new DomHtmlNode(document);
 	}
 