Fix diffWords crashing when used with an Intl.Segmenter on a text with consecutive newlines (#631)

ExplodingCabbage · web-flow · commit da071fef795a · 2025-10-06T12:17:29.000+01:00
* Add test for broken case reported in #630 * Fix the bug * Add release notes
diff --git a/release-notes.md b/release-notes.md
@@ -1,5 +1,9 @@
 # Release Notes
 
+## Future 8.0.3 release
+
+- [#631](https://github.com/kpdecker/jsdiff/pull/631) - **fix support for using an `Intl.Segmenter` with `diffWords`**. This has been almost completely broken since the feature was added in v6.0.0, since it would outright crash on any text that featured two consecutive newlines between a pair of words (a very common case).
+
 ## 8.0.2
 
 - [#616](https://github.com/kpdecker/jsdiff/pull/616) **Restored compatibility of `diffSentences` with old Safari versions.** This was broken in 8.0.0 by the introduction of a regex with a [lookbehind assertion](https://developer.mozilla.org/en-US/docs/Web/JavaScript/Reference/Regular_expressions/Lookbehind_assertion); these weren't supported in Safari prior to version 16.4.
diff --git a/src/diff/word.ts b/src/diff/word.ts
@@ -67,7 +67,24 @@ class WordDiff extends Diff<string, string> {
       if (segmenter.resolvedOptions().granularity != 'word') {
         throw new Error('The segmenter passed must have a granularity of "word"');
       }
-      parts = Array.from(segmenter.segment(value), segment => segment.segment);
+      // We want `parts` to be an array whose elements alternate between being
+      // pure whitespace and being pure non-whitespace. This is ALMOST what the
+      // segments returned by a word-based Intl.Segmenter already look like,
+      // and therefore we can ALMOST get what we want by simply doing...
+      //     parts = Array.from(segmenter.segment(value), segment => segment.segment);
+      // ... but not QUITE, because there's one annoying special case: every
+      // newline character gets its own segment, instead of sharing a segment
+      // with other surrounding whitespace. We therefore need to manually merge
+      // consecutive segments of whitespace into a single part:
+      parts = [];
+      for (const segmentObj of Array.from(segmenter.segment(value))) {
+        const segment = segmentObj.segment;
+        if (parts.length && (/\s/).test(parts[parts.length - 1]) && (/\s/).test(segment)) {
+          parts[parts.length - 1] += segment;
+        } else {
+          parts.push(segment);
+        }
+      }
     } else {
       parts = value.match(tokenizeIncludingWhitespace) || [];
     }
diff --git a/test/diff/word.js b/test/diff/word.js
@@ -284,6 +284,18 @@ describe('WordDiff', function() {
         diffWords('foo', 'bar', {intlSegmenter: segmenter});
       }).to['throw']('The segmenter passed must have a granularity of "word"');
     });
+
+    it("doesn't blow up when using an Intl.Segmenter on a text with a double newline", () => {
+      // Regression test for https://github.com/kpdecker/jsdiff/issues/630
+      const englishSegmenter = new Intl.Segmenter('en', {granularity: 'word'});
+      expect(convertChangesToXML(diffWords(
+        'A\n\nX',
+        'B\n\nX',
+        {intlSegmenter: englishSegmenter}
+      ))).to.equal(
+        '<del>A</del><ins>B</ins>\n\nX'
+      );
+    });
   });
 
   describe('#diffWordsWithSpace', function() {