Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 29 additions & 0 deletions main/src/main/scala/org/clulab/processors/Document.scala
Original file line number Diff line number Diff line change
Expand Up @@ -184,7 +184,36 @@ class Document(val sentences: Array[Sentence]) extends Serializable {
}
}
})
}

/** Builds a new Document around `sentences`, carrying over this document's
  * id, text, attachments, and DCT. Coreference chains cannot be carried over
  * (their Mentions reference the old document), so both documents are
  * required to have none.
  */
protected def replaceSentences(sentences: Array[Sentence]): Document = {
  val replacement = new Document(sentences)

  replacement.id = id
  replacement.text = text

  // Moving chains between documents is infeasible; insist none exist.
  require(replacement.coreferenceChains.isEmpty)
  require(coreferenceChains.isEmpty)

  getAttachmentKeys.foreach { key =>
    // The key came from getAttachmentKeys, so the .get is safe.
    val attachment = getAttachment(key).get
    // A pre-existing attachment under the same key must be equal before it is overwritten.
    require(replacement.getAttachment(key).forall(_ == attachment))
    replacement.addAttachment(key, attachment)
  }

  getDCT.foreach(replacement.setDCT)
  replacement
}

/** Returns a document whose sentences have every character offset shifted
  * right by `offset`, or `this` unchanged when the shift is zero.
  */
def offset(offset: Int): Document =
  if (offset == 0) this
  else replaceSentences(sentences.map(_.offset(offset)))
}

Expand Down
63 changes: 63 additions & 0 deletions main/src/main/scala/org/clulab/processors/Processor.scala
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,69 @@ trait Processor {
/** Constructs a document of tokens from free text; includes sentence splitting and tokenization. */
def mkDocument (text:String, keepText:Boolean = false): Document

// The documents here were created with Processor.mkDocument, which could have created a subclassed
// Document or documents with certain fields already filled in. This implementation only handles
// known document fields and then only performs rudimentary requirement checks to make sure that
// the documents are compatible for combination. In more complicated situations it would be necessary
// to override this method in the Processor subclass.
/** Merges two or more documents into one by concatenating their sentences.
  * The inputs must agree on id and DCT, must have no coreference chains
  * (their Mentions reference the original documents and cannot be rehomed),
  * and any attachment key shared between inputs must map to equal values.
  * The combined text, if any, is supplied by the caller.
  */
protected def combineDocuments(documents: IndexedSeq[Document], combinedTextOpt: Option[String]): Document = {
  require(documents.length > 1)

  val first = documents.head
  val rest = documents.tail
  // The combined document holds all sentences, in input order.
  val combined = new Document(documents.flatMap(_.sentences).toArray)

  // Every input must agree on the id, which the combination inherits.
  require(rest.forall(_.id == first.id))
  combined.id = first.id

  // A freshly constructed Document has no text, so nothing is clobbered here.
  require(combined.text.isEmpty)
  combined.text = combinedTextOpt

  // Coreference chains hold Mentions tied to their source Document; moving
  // them would be infeasible, so combination is only defined when none exist.
  require(combined.coreferenceChains.isEmpty)
  require(documents.forall(_.coreferenceChains.isEmpty))

  // Fold in attachments, insisting that a duplicated key carries an equal value.
  documents.foreach { document =>
    document.getAttachmentKeys.foreach { key =>
      // The key came from getAttachmentKeys, so the .get is safe.
      val attachment = document.getAttachment(key).get
      require(combined.getAttachment(key).forall(_ == attachment))
      combined.addAttachment(key, attachment)
    }
  }

  // All inputs must share one document creation time, if any.
  val dctOpt = first.getDCT
  require(rest.forall(_.getDCT == dctOpt))
  dctOpt.foreach(combined.setDCT)

  combined
}

/** Processes each text independently and combines the results into a single
  * document, as if `texts` and `trailers` had been interleaved and processed
  * as one string. `trailers(i)` is the text that follows `texts(i)` in the
  * combined text but is itself excluded from processing (e.g. a separator
  * such as markup stripped between segments). The two sequences must have
  * equal length. With zero texts an empty document is produced; with one,
  * this is equivalent to `mkDocument`.
  */
def mkCombinedDocument(texts: IndexedSeq[String], trailers: IndexedSeq[String], keepText: Boolean = false): Document = {
  require(texts.length == trailers.length)
  if (texts.isEmpty) mkDocument("", keepText)
  else if (texts.length == 1) mkDocument(texts.head, keepText)
  else {
    val pairs = texts.zip(trailers)
    // Running character offset of each text within the combined text; scanLeft
    // yields one extra trailing element, which the zip below discards.
    val offsets = pairs.scanLeft(0) { case (offset, (text, trailer)) => offset + text.length + trailer.length }
    val offsetDocuments = texts.map(mkDocument(_, keepText)).zip(offsets).map { case (document, offset) =>
      document.offset(offset)
    }
    val combinedTextOpt =
      if (keepText) Some(pairs.map { case (text, trailer) => text + trailer }.mkString)
      else None

    combineDocuments(offsetDocuments, combinedTextOpt)
  }
}

/** Constructs a document of tokens from an array of untokenized sentences. */
def mkDocumentFromSentences (sentences:Iterable[String],
keepText:Boolean = false,
Expand Down
20 changes: 19 additions & 1 deletion main/src/main/scala/org/clulab/processors/Sentence.scala
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ import org.clulab.struct.{DirectedGraph, GraphMap, RelationTriple, Tree}
import org.clulab.struct.GraphMap._
import org.clulab.utils.SeqUtils

import scala.collection.immutable.Range
import scala.collection.mutable
import scala.util.hashing.MurmurHash3._

Expand Down Expand Up @@ -174,6 +173,25 @@ class Sentence(

reverted
}

/** Returns a copy of this sentence with every character offset shifted right
  * by `offset`; annotation layers are shared with the original, not copied.
  * Returns `this` unchanged when the shift is zero.
  */
def offset(offset: Int): Sentence = {
  if (offset == 0) this
  else {
    val shiftedStarts = startOffsets.map(_ + offset).toArray
    val shiftedEnds = endOffsets.map(_ + offset).toArray
    val shifted = Sentence(raw, shiftedStarts, shiftedEnds, words)

    // Only the character offsets change; each annotation layer carries over as-is.
    shifted.tags = tags
    shifted.lemmas = lemmas
    shifted.entities = entities
    shifted.norms = norms
    shifted.chunks = chunks
    shifted.syntacticTree = syntacticTree
    shifted.graphs = graphs
    shifted.relations = relations

    shifted
  }
}
}

object Sentence {
Expand Down
10 changes: 10 additions & 0 deletions main/src/test/resources/org/clulab/processors/sentences10.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
Needed lines of action will be decided on by representatives of some 50 nations .
Scarcity , not only of foodstuffs but of lumber and other forest products , textiles , seeds , fertilizers , draught power , and farm equipment will continue throughout most of Europe and Asia during the coming year .
Hopes of continued recovery in Europe 's indigenous food supplies were checked by last winter 's bad weather .
Diets in Western and Central Europe will be still lower next year , and in Asia they will remain at present very low levels , unless imports can be increased .
Even to hold the present line will require drastic action .
Minimum import needs for Europe , North Africa , and Asia in 1947/48 may be estimated at 34 to 38 million tons without allowing for any improvement in bread rations , any additional livestock feeding , or any increase in working reserves .
Against this need , supplies of grain available for export from the surplus countries may be tentatively estimated at 30 to 34 million tons .
Even with somewhat larger supplies of certain other foods particularly potatoes , sugar , and fats the situation will continue to be grim .
Cessation of UNRRA activities and accumulated foreign exchange difficulties worsen the problem for nations in a weak bargaining position .
Every delay in improving this situation further impairs the working ability of labour , slows up reconstruction , adds to the physical damage caused by prolonged undernourishment , and accelerates social unrest .
130 changes: 130 additions & 0 deletions main/src/test/scala/org/clulab/processors/TestMkCombinedDocument.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
package org.clulab.processors

import org.clulab.processors.clu.CluProcessor
import org.clulab.serialization.DocumentSerializer
import org.clulab.utils.Closer.AutoCloser
import org.clulab.utils.{Sourcer, Test}

import java.io.{PrintWriter, StringWriter}

// Verifies that Processor.mkCombinedDocument over grouped sentences produces a
// document that serializes identically to processing the whole text in one call,
// and that trailers control whether separators appear in the text and/or words.
class TestMkCombinedDocument extends Test {
// Ten fixture sentences, one per line, pre-tokenized (spaces around punctuation).
val sentences = Sourcer.sourceFromFilename("./main/src/test/resources/org/clulab/processors/sentences10.txt").autoClose { source =>
source.getLines.toArray
}
// Each row partitions the 10 fixture sentences into consecutive groups by size;
// every row must sum to 10 (checked in test()).  Zero-length groups are allowed.
val manySentenceLengths = Array(
Array(1, 9),
Array(9, 1),
Array(1, 1, 8),
Array(1, 8, 1),
Array(8, 1, 1),
Array(5, 5),
Array(2, 2, 2, 2, 2),
Array(1, 2, 3, 4),
Array(4, 3, 2, 1),
Array(0, 5, 0, 5)
)
val separator = " "
val documentSerializer = new DocumentSerializer()
val processor = new CluProcessor()

// Serializes a document (including its text) to a string so that two documents
// can be compared for equivalence via string equality.
def toString(document: Document): String = {
val stringWriter = new StringWriter()

new PrintWriter(stringWriter).autoClose { printWriter =>
documentSerializer.save(document, printWriter, keepText = true)
}
stringWriter.toString
}

behavior of "mkCombinedDocument"

// Registers one test case: split the fixture sentences into groups of the given
// sizes, combine them with mkCombinedDocument, and expect the serialization to
// match expectedResult (the single-call baseline computed below).
def test(sentenceLengths: Array[Int], expectedResult: String): Unit = {
val label = sentenceLengths.mkString("[", ", ", "]")

it should s"combine $label" in {
// scanLeft turns group lengths into starting indices; the extra final element
// is the total and must equal the fixture size.
val sentenceStarts = sentenceLengths.scanLeft(0) { case (start, split) => start + split }
assert(sentenceStarts.last == 10)
val sentenceGroups = sentenceStarts.zip(sentenceLengths).map { case (start, length) =>
sentences.slice(start, start + length).mkString(separator)
}
// A trailer (separator) follows every group except an empty one or the last one.
val trailers = sentenceGroups.zipWithIndex.map { case (sentenceGroup, index) =>
if (sentenceGroup.isEmpty || index == sentenceGroups.indices.last) ""
else separator
}
val document = processor.mkCombinedDocument(sentenceGroups, trailers, keepText = true)
val actualResult = toString(document)

actualResult should be(expectedResult)
}
}

// Baseline: process all ten sentences as one text, then register every grouping
// in manySentenceLengths as a test expected to reproduce that serialization.
{
val document = processor.mkDocument(sentences.mkString(separator), keepText = true)
val expectedResult = toString(document)

manySentenceLengths.foreach { sentenceLengths =>
test(sentenceLengths, expectedResult)
}
}

behavior of "dynamically separated texts"

// Separator left attached to each text and not declared as a trailer:
// it should be processed, appearing both in the text and as a word.
it should "include separators in both text and words" in {
val text = "I found this text<br>on a web page."
val separator = "<br>"
val texts = text.split(separator)
// Re-append the separator to every segment except the last.
val dirtyTexts = texts.zipWithIndex.map { case (text, index) =>
if (index != texts.indices.last) text + separator
else text
}
val indices = texts.indices
val trailers = indices.map { _ => "" }
val document = processor.mkCombinedDocument(dirtyTexts, trailers, keepText = true)

document.text.get should be (text)
document.sentences.length should be (indices.length)

document.sentences.zipWithIndex.foreach { case (sentence, index) =>
if (index != indices.last)
sentence.words should contain (separator)
else
sentence.words should not contain (separator)
}
}

// Separator declared as a trailer: it is preserved in the text for offset
// accounting but never processed, so it must not show up among the words.
// This is thought to be the standard case.
it should "include separators in text but not words" in {
val text = "I found this text<br>on a web page."
val separator = "<br>"
val texts = text.split(separator)
val indices = texts.indices
val trailers = indices.map { index => if (index != indices.last) separator else "" }
val document = processor.mkCombinedDocument(texts, trailers, keepText = true)

document.text.get should be (text)
document.sentences.length should be (indices.length)

document.sentences.foreach { sentence =>
sentence.words should not contain(separator)
}
}

// Separator replaced by a neutral trailer (a space): the combined text differs
// from the original and the separator appears neither in text nor in words.
it should "include separators in neither text nor words" in {
val text = "I found this text<br>on a web page."
val separator = "<br>"
val cleanSeparator = " "
val cleanText = text.replace(separator, cleanSeparator)
val texts = text.split(separator)
val indices = texts.indices
val trailers = indices.map { index => if (index != indices.last) cleanSeparator else "" }
val document = processor.mkCombinedDocument(texts, trailers, keepText = true)

document.text.get should be(cleanText)
document.sentences.length should be(indices.length)

document.sentences.foreach { sentence =>
sentence.words should not contain (separator)
}
}
}