Skip to content
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 2 additions & 21 deletions codepropertygraph/src/main/scala/io/shiftleft/utils/IOUtils.scala
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,6 @@ import scala.util.Using

object IOUtils {

// Matches one character outside the range U+0000..U+FFFF, i.e. beyond the Basic Multilingual Plane.
// NOTE(review): java.util.regex is documented to match by code point, so each hit should span a
// full surrogate pair (two UTF-16 chars) - confirm against the Pattern documentation.
private val surrogatePattern: Pattern = Pattern.compile("[^\u0000-\uffff]")

private val boms: Set[Char] = Set(
'\uefbb', // UTF-8
'\ufeff', // UTF-16 (BE)
Expand Down Expand Up @@ -39,27 +37,10 @@ object IOUtils {
}
}

/** Masks every match of `surrogatePattern` in `input` with question marks.
  *
  * Java strings are sequences of 16-bit code units that model unicode code points. Characters beyond the basic
  * multilingual plane need two code units (a surrogate pair) when stored, and those are the characters the pattern
  * is written to find.
  *
  * The replacement is a run of `?` whose length equals the number of code units spanned by the first match; that
  * same replacement is then applied to every match. When nothing matches, `input` is returned unchanged.
  *
  * NOTE(review): despite the method name, the pattern targets characters above U+FFFF rather than lone (unpaired)
  * surrogates - confirm that this is the intended behaviour.
  *
  * @param input
  *   the text to sanitize
  * @return
  *   `input` with every pattern match replaced by question marks, or `input` itself if there is no match
  */
private def replaceUnpairedSurrogates(input: String): String = {
  val matcher = surrogatePattern.matcher(input)
  if (!matcher.find()) {
    // Nothing outside the permitted range: hand the original string back untouched.
    input
  } else {
    // Width of the first hit in UTF-16 code units; it determines the length of the mask.
    val width       = matcher.end() - matcher.start()
    val placeholder = "?" * width
    // replaceAll resets the matcher, so the first hit is replaced as well.
    matcher.replaceAll(placeholder)
  }
}

/** Returns the lines read from `bufferedSource` as a sequence of strings.
  *
  * The source's buffered reader is first passed to `skipBOMIfPresent` (presumably to step over a leading byte-order
  * mark; its body is not visible here - confirm).
  *
  * @param bufferedSource
  *   the source to read from
  * @return
  *   the value of the last expression below, i.e. the lines obtained from the reader at that point
  */
private def contentFromBufferedSource(bufferedSource: BufferedSource): Seq[String] = {
val reader = bufferedSource.bufferedReader()
skipBOMIfPresent(reader)
// NOTE(review): the next two expressions look like the removed and the added side of one diff line
// shown together. As written, the result of the first expression is discarded and only the second
// one is returned; both read from the same reader, so the first may already have consumed lines.
// TODO confirm which single line is intended.
reader.lines().iterator().asScala.map(replaceUnpairedSurrogates).toSeq
reader.lines().iterator().asScala.toSeq
}

private def contentStringFromBufferedSource(bufferedSource: BufferedSource): String = {
Expand All @@ -78,7 +59,7 @@ object IOUtils {
}
}

replaceUnpairedSurrogates(stringBuilder.toString)
stringBuilder.toString
}

/** Reads a file at the given path and:
Expand Down
Loading