Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 29 additions & 0 deletions main/src/main/scala/org/clulab/processors/Document.scala
Original file line number Diff line number Diff line change
Expand Up @@ -184,7 +184,36 @@ class Document(val sentences: Array[Sentence]) extends Serializable {
}
}
})
}

/** Builds a new Document around `sentences`, carrying over this document's
  * id, text, attachments, and DCT. Coreference chains cannot be carried over
  * (their Mentions reference the old document), so both documents are
  * required to have none.
  */
protected def replaceSentences(sentences: Array[Sentence]): Document = {
  val replacement = new Document(sentences)

  replacement.id = id
  replacement.text = text

  // Moving chains between documents is infeasible; insist none exist.
  require(replacement.coreferenceChains.isEmpty)
  require(coreferenceChains.isEmpty)

  getAttachmentKeys.foreach { key =>
    // The key came from getAttachmentKeys, so the .get is safe.
    val attachment = getAttachment(key).get
    // A pre-existing attachment under the same key must be equal before it is overwritten.
    require(replacement.getAttachment(key).forall(_ == attachment))
    replacement.addAttachment(key, attachment)
  }

  getDCT.foreach(replacement.setDCT)
  replacement
}

/** Returns a document whose sentences have every character offset shifted
  * right by `offset`, or `this` unchanged when the shift is zero.
  */
def offset(offset: Int): Document =
  if (offset == 0) this
  else replaceSentences(sentences.map(_.offset(offset)))
}

Expand Down
63 changes: 63 additions & 0 deletions main/src/main/scala/org/clulab/processors/Processor.scala
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,69 @@ trait Processor {
/** Constructs a document of tokens from free text; includes sentence splitting and tokenization. */
def mkDocument (text:String, keepText:Boolean = false): Document

// The documents here were created with Processor.mkDocument, which could have created a subclassed
// Document or documents with certain fields already filled in. This implementation only handles
// known document fields and then only performs rudimentary requirement checks to make sure that
// the documents are compatible for combination. In more complicated situations it would be necessary
// to override this method in the Processor subclass.
/** Merges two or more documents into one by concatenating their sentences.
  * The inputs must agree on id and DCT, must have no coreference chains
  * (their Mentions reference the original documents and cannot be rehomed),
  * and any attachment key shared between inputs must map to equal values.
  * The combined text, if any, is supplied by the caller.
  */
protected def combineDocuments(documents: IndexedSeq[Document], combinedTextOpt: Option[String]): Document = {
  require(documents.length > 1)

  val first = documents.head
  val rest = documents.tail
  // The combined document holds all sentences, in input order.
  val combined = new Document(documents.flatMap(_.sentences).toArray)

  // Every input must agree on the id, which the combination inherits.
  require(rest.forall(_.id == first.id))
  combined.id = first.id

  // A freshly constructed Document has no text, so nothing is clobbered here.
  require(combined.text.isEmpty)
  combined.text = combinedTextOpt

  // Coreference chains hold Mentions tied to their source Document; moving
  // them would be infeasible, so combination is only defined when none exist.
  require(combined.coreferenceChains.isEmpty)
  require(documents.forall(_.coreferenceChains.isEmpty))

  // Fold in attachments, insisting that a duplicated key carries an equal value.
  documents.foreach { document =>
    document.getAttachmentKeys.foreach { key =>
      // The key came from getAttachmentKeys, so the .get is safe.
      val attachment = document.getAttachment(key).get
      require(combined.getAttachment(key).forall(_ == attachment))
      combined.addAttachment(key, attachment)
    }
  }

  // All inputs must share one document creation time, if any.
  val dctOpt = first.getDCT
  require(rest.forall(_.getDCT == dctOpt))
  dctOpt.foreach(combined.setDCT)

  combined
}

/** Processes each text independently and combines the results into a single
  * document, as if `texts` and `trailers` had been interleaved and processed
  * as one string. `trailers(i)` is the text that follows `texts(i)` in the
  * combined text but is itself excluded from processing (e.g. a separator
  * such as markup stripped between segments). The two sequences must have
  * equal length. With zero texts an empty document is produced; with one,
  * this is equivalent to `mkDocument`.
  */
def mkCombinedDocument(texts: IndexedSeq[String], trailers: IndexedSeq[String], keepText: Boolean = false): Document = {
  require(texts.length == trailers.length)
  if (texts.isEmpty) mkDocument("", keepText)
  else if (texts.length == 1) mkDocument(texts.head, keepText)
  else {
    val pairs = texts.zip(trailers)
    // Running character offset of each text within the combined text; scanLeft
    // yields one extra trailing element, which the zip below discards.
    val offsets = pairs.scanLeft(0) { case (offset, (text, trailer)) => offset + text.length + trailer.length }
    val offsetDocuments = texts.map(mkDocument(_, keepText)).zip(offsets).map { case (document, offset) =>
      document.offset(offset)
    }
    val combinedTextOpt =
      if (keepText) Some(pairs.map { case (text, trailer) => text + trailer }.mkString)
      else None

    combineDocuments(offsetDocuments, combinedTextOpt)
  }
}

/** Constructs a document of tokens from an array of untokenized sentences. */
def mkDocumentFromSentences (sentences:Iterable[String],
keepText:Boolean = false,
Expand Down
20 changes: 19 additions & 1 deletion main/src/main/scala/org/clulab/processors/Sentence.scala
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@ import org.clulab.struct.{DirectedGraph, GraphMap, RelationTriple, Tree}
import org.clulab.struct.GraphMap._
import org.clulab.utils.SeqUtils

import scala.collection.immutable.Range
import scala.collection.mutable
import scala.util.hashing.MurmurHash3._

Expand Down Expand Up @@ -174,6 +173,25 @@ class Sentence(

reverted
}

/** Returns a copy of this sentence with every character offset shifted right
  * by `offset`; annotation layers are shared with the original, not copied.
  * Returns `this` unchanged when the shift is zero.
  */
def offset(offset: Int): Sentence = {
  if (offset == 0) this
  else {
    val shiftedStarts = startOffsets.map(_ + offset).toArray
    val shiftedEnds = endOffsets.map(_ + offset).toArray
    val shifted = Sentence(raw, shiftedStarts, shiftedEnds, words)

    // Only the character offsets change; each annotation layer carries over as-is.
    shifted.tags = tags
    shifted.lemmas = lemmas
    shifted.entities = entities
    shifted.norms = norms
    shifted.chunks = chunks
    shifted.syntacticTree = syntacticTree
    shifted.graphs = graphs
    shifted.relations = relations

    shifted
  }
}
}

object Sentence {
Expand Down
10 changes: 10 additions & 0 deletions main/src/test/resources/org/clulab/processors/sentences10.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
Needed lines of action will be decided on by representatives of some 50 nations .
Scarcity , not only of foodstuffs but of lumber and other forest products , textiles , seeds , fertilizers , draught power , and farm equipment will continue throughout most of Europe and Asia during the coming year .
Hopes of continued recovery in Europe 's indigenous food supplies were checked by last winter 's bad weather .
Diets in Western and Central Europe will be still lower next year , and in Asia they will remain at present very low levels , unless imports can be increased .
Even to hold the present line will require drastic action .
Minimum import needs for Europe , North Africa , and Asia in 1947/48 may be estimated at 34 to 38 million tons without allowing for any improvement in bread rations , any additional livestock feeding , or any increase in working reserves .
Against this need , supplies of grain available for export from the surplus countries may be tentatively estimated at 30 to 34 million tons .
Even with somewhat larger supplies of certain other foods particularly potatoes , sugar , and fats the situation will continue to be grim .
Cessation of UNRRA activities and accumulated foreign exchange difficulties worsen the problem for nations in a weak bargaining position .
Every delay in improving this situation further impairs the working ability of labour , slows up reconstruction , adds to the physical damage caused by prolonged undernourishment , and accelerates social unrest .
130 changes: 130 additions & 0 deletions main/src/test/scala/org/clulab/processors/TestMkCombinedDocument.scala
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
package org.clulab.processors

import org.clulab.processors.clu.CluProcessor
import org.clulab.serialization.DocumentSerializer
import org.clulab.utils.Closer.AutoCloser
import org.clulab.utils.{Sourcer, Test}

import java.io.{PrintWriter, StringWriter}

// Verifies that Processor.mkCombinedDocument over grouped sentences produces a
// document that serializes identically to processing the whole text in one call,
// and that trailers control whether separators appear in the text and/or words.
class TestMkCombinedDocument extends Test {
// Ten fixture sentences, one per line, pre-tokenized (spaces around punctuation).
val sentences = Sourcer.sourceFromFilename("./main/src/test/resources/org/clulab/processors/sentences10.txt").autoClose { source =>
source.getLines.toArray
}
// Each row partitions the 10 fixture sentences into consecutive groups by size;
// every row must sum to 10 (checked in test()).  Zero-length groups are allowed.
val manySentenceLengths = Array(
Array(1, 9),
Array(9, 1),
Array(1, 1, 8),
Array(1, 8, 1),
Array(8, 1, 1),
Array(5, 5),
Array(2, 2, 2, 2, 2),
Array(1, 2, 3, 4),
Array(4, 3, 2, 1),
Array(0, 5, 0, 5)
)
val separator = " "
val documentSerializer = new DocumentSerializer()
val processor = new CluProcessor()

// Serializes a document (including its text) to a string so that two documents
// can be compared for equivalence via string equality.
def toString(document: Document): String = {
val stringWriter = new StringWriter()

new PrintWriter(stringWriter).autoClose { printWriter =>
documentSerializer.save(document, printWriter, keepText = true)
}
stringWriter.toString
}

behavior of "mkCombinedDocument"

// Registers one test case: split the fixture sentences into groups of the given
// sizes, combine them with mkCombinedDocument, and expect the serialization to
// match expectedResult (the single-call baseline computed below).
def test(sentenceLengths: Array[Int], expectedResult: String): Unit = {
val label = sentenceLengths.mkString("[", ", ", "]")

it should s"combine $label" in {
// scanLeft turns group lengths into starting indices; the extra final element
// is the total and must equal the fixture size.
val sentenceStarts = sentenceLengths.scanLeft(0) { case (start, split) => start + split }
assert(sentenceStarts.last == 10)
val sentenceGroups = sentenceStarts.zip(sentenceLengths).map { case (start, length) =>
sentences.slice(start, start + length).mkString(separator)
}
// A trailer (separator) follows every group except an empty one or the last one.
val trailers = sentenceGroups.zipWithIndex.map { case (sentenceGroup, index) =>
if (sentenceGroup.isEmpty || index == sentenceGroups.indices.last) ""
else separator
}
val document = processor.mkCombinedDocument(sentenceGroups, trailers, keepText = true)
val actualResult = toString(document)

actualResult should be(expectedResult)
}
}

// Baseline: process all ten sentences as one text, then register every grouping
// in manySentenceLengths as a test expected to reproduce that serialization.
{
val document = processor.mkDocument(sentences.mkString(separator), keepText = true)
val expectedResult = toString(document)

manySentenceLengths.foreach { sentenceLengths =>
test(sentenceLengths, expectedResult)
}
}

behavior of "dynamically separated texts"

// Separator left attached to each text and not declared as a trailer:
// it should be processed, appearing both in the text and as a word.
it should "include separators in both text and words" in {
val text = "I found this text<br>on a web page."
val separator = "<br>"
val texts = text.split(separator)
// Re-append the separator to every segment except the last.
val dirtyTexts = texts.zipWithIndex.map { case (text, index) =>
if (index != texts.indices.last) text + separator
else text
}
val indices = texts.indices
val trailers = indices.map { _ => "" }
val document = processor.mkCombinedDocument(dirtyTexts, trailers, keepText = true)

document.text.get should be (text)
document.sentences.length should be (indices.length)

document.sentences.zipWithIndex.foreach { case (sentence, index) =>
if (index != indices.last)
sentence.words should contain (separator)
else
sentence.words should not contain (separator)
}
}

// Separator declared as a trailer: it is preserved in the text for offset
// accounting but never processed, so it must not show up among the words.
// This is thought to be the standard case.
it should "include separators in text but not words" in {
val text = "I found this text<br>on a web page."
val separator = "<br>"
val texts = text.split(separator)
val indices = texts.indices
val trailers = indices.map { index => if (index != indices.last) separator else "" }
val document = processor.mkCombinedDocument(texts, trailers, keepText = true)

document.text.get should be (text)
document.sentences.length should be (indices.length)

document.sentences.foreach { sentence =>
sentence.words should not contain(separator)
}
}

// Separator replaced by a neutral trailer (a space): the combined text differs
// from the original and the separator appears neither in text nor in words.
it should "include separators in neither text nor words" in {
val text = "I found this text<br>on a web page."
val separator = "<br>"
val cleanSeparator = " "
val cleanText = text.replace(separator, cleanSeparator)
val texts = text.split(separator)
val indices = texts.indices
val trailers = indices.map { index => if (index != indices.last) cleanSeparator else "" }
val document = processor.mkCombinedDocument(texts, trailers, keepText = true)

document.text.get should be(cleanText)
document.sentences.length should be(indices.length)

document.sentences.foreach { sentence =>
sentence.words should not contain (separator)
}
}
}