TIKA-3456 -- LanguageDetector should chunk long strings and test for hasEnoughText.

tballison · tballison · commit 4ba5fd7eb8b1 · 2021-06-28T12:15:33.000-04:00
diff --git a/tika-core/src/main/java/org/apache/tika/language/detect/LanguageDetector.java b/tika-core/src/main/java/org/apache/tika/language/detect/LanguageDetector.java
@@ -50,14 +50,18 @@ public abstract class LanguageDetector {
 
 	private static final ServiceLoader DEFAULT_SERVICE_LOADER = new ServiceLoader();
 
+	// Chunk size, in chars, used by addText(CharSequence): text at least this long is added
+	// in pieces of this size until hasEnoughText() is true or the text is exhausted.
+	private static final int BUFFER_LENGTH = 4096;
+
 	// True if text is expected to be a mix of languages, and thus higher-resolution
 	// detection must be done to avoid under-sampling the text.
 	protected boolean mixedLanguages = false;
 	
 	// True if the text is expected to be 'short' (typically less than 100 chars), and
 	// thus a different algorithm and/or set of profiles should be used.
 	protected boolean shortText = false;
-	
+
 	public static LanguageDetector getDefaultLanguageDetector() {
 		List<LanguageDetector> detectors = getLanguageDetectors();
 		if (detectors.isEmpty()) {
@@ -183,8 +187,19 @@ public LanguageDetector setShortText(boolean shortText) {
 	 * @param text Characters to add to current statistics.
 	 */
 	public void addText(CharSequence text) {
-		char[] chars = text.toString().toCharArray();
-		addText(chars, 0, chars.length);
+		int len = text.length();
+		if (len < BUFFER_LENGTH) { // short input: added in a single call, without consulting hasEnoughText()
+			char[] chars = text.toString().toCharArray();
+			addText(chars, 0, chars.length);
+			return;
+		}
+		int start = 0; // offset of the next chunk within text
+		while (! hasEnoughText() && start < len) { // stops early once hasEnoughText() is true; remaining text is not added
+			int end = Math.min(start + BUFFER_LENGTH, len); // clamp the final chunk to the end of text
+			char[] chars = text.subSequence(start, end).toString().toCharArray(); // NOTE(review): a boundary may split a surrogate pair -- confirm detectors tolerate this
+			addText(chars, 0, chars.length);
+			start += BUFFER_LENGTH;
+		}
 	}