Skip to content

Commit 4ba5fd7

Browse files
committed
TIKA-3456 -- LanguageDetector should chunk long strings and test for hasEnoughText.
1 parent 90c6ea4 commit 4ba5fd7

File tree

1 file changed

+18
-3
lines changed

1 file changed

+18
-3
lines changed

tika-core/src/main/java/org/apache/tika/language/detect/LanguageDetector.java

Lines changed: 18 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -50,14 +50,18 @@ public abstract class LanguageDetector {
5050

5151
private static final ServiceLoader DEFAULT_SERVICE_LOADER = new ServiceLoader();
5252

53+
// If a caller passes a very long string, split it into chunks of this many chars
54+
// and add the chunks one at a time until hasEnoughText() returns true.
55+
private static final int BUFFER_LENGTH = 4096;
56+
5357
// True if text is expected to be a mix of languages, and thus higher-resolution
5458
// detection must be done to avoid under-sampling the text.
5559
protected boolean mixedLanguages = false;
5660

5761
// True if the text is expected to be 'short' (typically less than 100 chars), and
5862
// thus a different algorithm and/or set of profiles should be used.
5963
protected boolean shortText = false;
60-
64+
6165
public static LanguageDetector getDefaultLanguageDetector() {
6266
List<LanguageDetector> detectors = getLanguageDetectors();
6367
if (detectors.isEmpty()) {
@@ -183,8 +187,19 @@ public LanguageDetector setShortText(boolean shortText) {
183187
* @param text Characters to add to current statistics.
184188
*/
185189
public void addText(CharSequence text) {
186-
char[] chars = text.toString().toCharArray();
187-
addText(chars, 0, chars.length);
190+
int len = text.length();
191+
if (len < BUFFER_LENGTH) {
192+
char[] chars = text.toString().toCharArray();
193+
addText(chars, 0, chars.length);
194+
return;
195+
}
196+
int start = 0;
197+
while (! hasEnoughText() && start < len) {
198+
int end = Math.min(start + BUFFER_LENGTH, len);
199+
char[] chars = text.subSequence(start, end).toString().toCharArray();
200+
addText(chars, 0, chars.length);
201+
start += BUFFER_LENGTH;
202+
}
188203
}
189204

190205

0 commit comments

Comments
 (0)