diff --git a/.gitignore b/.gitignore new file mode 100644 index 00000000..e510fa99 --- /dev/null +++ b/.gitignore @@ -0,0 +1,10 @@ +*.iml +.gradle +.idea +.DS_Store +build +captures +.externalNativeBuild +.cxx +local.properties +xcuserdata \ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 00000000..2e724481 --- /dev/null +++ b/README.md @@ -0,0 +1,54 @@ +# Ksoup: Kotlin Multiplatform HTML Parser + +**Ksoup** is a Kotlin Multiplatform library for working with real-world HTML and XML. It's a port of the renowned Java library, **jsoup**, and offers an easy-to-use API for URL fetching, data parsing, extraction, and manipulation using DOM, CSS, and xpath selectors. + +Ksoup implements the [WHATWG HTML5](https://html.spec.whatwg.org/multipage/) specification, parsing HTML to the same DOM as modern browsers do, but with support for Android, JVM, and native platforms. + +## Features +- Scrape and parse HTML from a URL, file, or string +- Find and extract data using DOM traversal or CSS selectors +- Manipulate HTML elements, attributes, and text +- Clean user-submitted content against a safe-list to prevent XSS attacks +- Output tidy HTML + +Ksoup is adept at handling all varieties of HTML found in the wild. + +## Current Limitations +As of now, Ksoup does not implement the connection cookies and servlet-related features of jsoup. This is an area under consideration for future development. + +## Multiplatform Support +- **Android**: Extensive support for Android development. +- **JVM**: Compatible with Java Virtual Machine environments. +- **Native**: Supports native platform development. + +## Open source +Ksoup is an open source project, a Kotlin Multiplatform port of jsoup, distributed under the MIT license. The source code of Ksoup is available on [GitHub](https://github.com/fleeksoft/ksoup). 
+ +## Getting started +- Add the library to dependencies: + +## Gradle +```kotlin +// for kotlin multiplatform +commonMain { + dependencies { + implementation("com.fleeksoft:ksoup:0.0.1") + } +} +``` + +## Development and Support +For questions, ideas, or contributions regarding Ksoup, please contact us via [email](mailto:fleeksoft@gmail.com). + +Report any issues on [our GitHub page](https://github.com/fleeksoft/ksoup/issues), ensuring to check for duplicates beforehand. + +## Status +Ksoup is in a stable release phase, continually evolving from its jsoup origins. + + + +## Status with Jsoup +Updated with the main branch of Jsoup on GitHub [c46870c266b0c1112f4b1d423cf6dd9290d04d2f] as of 14 November 2023. + +## ROAD MAP +- Publish for swift or xcode framework using kmmbridge \ No newline at end of file diff --git a/build.gradle.kts b/build.gradle.kts new file mode 100644 index 00000000..12d80f7e --- /dev/null +++ b/build.gradle.kts @@ -0,0 +1,5 @@ +plugins { + //trick: for the same plugin versions in all sub-modules + alias(libs.plugins.androidLibrary).apply(false) + alias(libs.plugins.kotlinMultiplatform).apply(false) +} diff --git a/gradle.properties b/gradle.properties new file mode 100644 index 00000000..dbfd5e46 --- /dev/null +++ b/gradle.properties @@ -0,0 +1,13 @@ +#Gradle +org.gradle.jvmargs=-Xmx2048M -Dfile.encoding=UTF-8 -Dkotlin.daemon.jvm.options\="-Xmx2048M" +org.gradle.caching=true +org.gradle.configuration-cache=true + +#Kotlin +kotlin.code.style=official + +#Android +android.useAndroidX=true +android.nonTransitiveRClass=true + +#kotlin.experimental.tryK2=true \ No newline at end of file diff --git a/gradle/libs.versions.toml b/gradle/libs.versions.toml new file mode 100644 index 00000000..07d94ff8 --- /dev/null +++ b/gradle/libs.versions.toml @@ -0,0 +1,40 @@ +[versions] +agp = "8.1.3" +kotlin = "1.9.20" +junitJupiter = "5.9.3" +compose = "1.5.4" +compose-compiler = "1.5.4" +compose-material3 = "1.1.2" +androidx-activityCompose = "1.8.0" +ktor = 
"2.3.6" +okio = "3.6.0" +kotlinxDatetime = "0.4.1" +kotlinx-io = "0.3.0" +codepoints = "0.6.1" +gson = "2.10.1" + +[libraries] +junit-jupiter = { module = "org.junit.jupiter:junit-jupiter", version.ref = "junitJupiter" } +kotlin-test = { module = "org.jetbrains.kotlin:kotlin-test", version.ref = "kotlin" } +androidx-activity-compose = { module = "androidx.activity:activity-compose", version.ref = "androidx-activityCompose" } +compose-ui = { module = "androidx.compose.ui:ui", version.ref = "compose" } +compose-ui-tooling = { module = "androidx.compose.ui:ui-tooling", version.ref = "compose" } +compose-ui-tooling-preview = { module = "androidx.compose.ui:ui-tooling-preview", version.ref = "compose" } +compose-foundation = { module = "androidx.compose.foundation:foundation", version.ref = "compose" } +compose-material3 = { module = "androidx.compose.material3:material3", version.ref = "compose-material3" } +ktor-core = { module = "io.ktor:ktor-client-core", version.ref = "ktor" } +ktor-client-logging = { module = "io.ktor:ktor-client-logging", version.ref = "ktor" } +ktor-client-darwin = { module = "io.ktor:ktor-client-darwin", version.ref = "ktor" } +ktor-client-okhttp = { module = "io.ktor:ktor-client-okhttp", version.ref = "ktor" } +okio = { module = "com.squareup.okio:okio", version.ref = "okio" } +kotlinx-datetime = { module = "org.jetbrains.kotlinx:kotlinx-datetime", version.ref = "kotlinxDatetime" } +kotlinx-io = { module = "org.jetbrains.kotlinx:kotlinx-io-core", version.ref = "kotlinx-io" } +codepoints = { module = "de.cketti.unicode:kotlin-codepoints-deluxe", version.ref = "codepoints" } +gson = { module = "com.google.code.gson:gson", version.ref = "gson" } + +[plugins] +androidApplication = { id = "com.android.application", version.ref = "agp" } +androidLibrary = { id = "com.android.library", version.ref = "agp" } +kotlinAndroid = { id = "org.jetbrains.kotlin.android", version.ref = "kotlin" } +kotlinMultiplatform = { id = 
"org.jetbrains.kotlin.multiplatform", version.ref = "kotlin" } +kotlinCocoapods = { id = "org.jetbrains.kotlin.native.cocoapods", version.ref = "kotlin" } diff --git a/gradle/wrapper/gradle-wrapper.properties b/gradle/wrapper/gradle-wrapper.properties new file mode 100644 index 00000000..3fa8f862 --- /dev/null +++ b/gradle/wrapper/gradle-wrapper.properties @@ -0,0 +1,7 @@ +distributionBase=GRADLE_USER_HOME +distributionPath=wrapper/dists +distributionUrl=https\://services.gradle.org/distributions/gradle-8.4-bin.zip +networkTimeout=10000 +validateDistributionUrl=true +zipStoreBase=GRADLE_USER_HOME +zipStorePath=wrapper/dists diff --git a/gradlew b/gradlew new file mode 100755 index 00000000..1aa94a42 --- /dev/null +++ b/gradlew @@ -0,0 +1,249 @@ +#!/bin/sh + +# +# Copyright © 2015-2021 the original authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# https://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +############################################################################## +# +# Gradle start up script for POSIX generated by Gradle. +# +# Important for running: +# +# (1) You need a POSIX-compliant shell to run this script. 
If your /bin/sh is +# noncompliant, but you have some other compliant shell such as ksh or +# bash, then to run this script, type that shell name before the whole +# command line, like: +# +# ksh Gradle +# +# Busybox and similar reduced shells will NOT work, because this script +# requires all of these POSIX shell features: +# * functions; +# * expansions «$var», «${var}», «${var:-default}», «${var+SET}», +# «${var#prefix}», «${var%suffix}», and «$( cmd )»; +# * compound commands having a testable exit status, especially «case»; +# * various built-in commands including «command», «set», and «ulimit». +# +# Important for patching: +# +# (2) This script targets any POSIX shell, so it avoids extensions provided +# by Bash, Ksh, etc; in particular arrays are avoided. +# +# The "traditional" practice of packing multiple parameters into a +# space-separated string is a well documented source of bugs and security +# problems, so this is (mostly) avoided, by progressively accumulating +# options in "$@", and eventually passing that to Java. +# +# Where the inherited environment variables (DEFAULT_JVM_OPTS, JAVA_OPTS, +# and GRADLE_OPTS) rely on word-splitting, this is performed explicitly; +# see the in-line comments for details. +# +# There are tweaks for specific operating systems such as AIX, CygWin, +# Darwin, MinGW, and NonStop. +# +# (3) This script is generated from the Groovy template +# https://github.com/gradle/gradle/blob/HEAD/subprojects/plugins/src/main/resources/org/gradle/api/internal/plugins/unixStartScript.txt +# within the Gradle project. +# +# You can find Gradle at https://github.com/gradle/gradle/. +# +############################################################################## + +# Attempt to set APP_HOME + +# Resolve links: $0 may be a link +app_path=$0 + +# Need this for daisy-chained symlinks. 
+while + APP_HOME=${app_path%"${app_path##*/}"} # leaves a trailing /; empty if no leading path + [ -h "$app_path" ] +do + ls=$( ls -ld "$app_path" ) + link=${ls#*' -> '} + case $link in #( + /*) app_path=$link ;; #( + *) app_path=$APP_HOME$link ;; + esac +done + +# This is normally unused +# shellcheck disable=SC2034 +APP_BASE_NAME=${0##*/} +# Discard cd standard output in case $CDPATH is set (https://github.com/gradle/gradle/issues/25036) +APP_HOME=$( cd "${APP_HOME:-./}" > /dev/null && pwd -P ) || exit + +# Use the maximum available, or set MAX_FD != -1 to use that value. +MAX_FD=maximum + +warn () { + echo "$*" +} >&2 + +die () { + echo + echo "$*" + echo + exit 1 +} >&2 + +# OS specific support (must be 'true' or 'false'). +cygwin=false +msys=false +darwin=false +nonstop=false +case "$( uname )" in #( + CYGWIN* ) cygwin=true ;; #( + Darwin* ) darwin=true ;; #( + MSYS* | MINGW* ) msys=true ;; #( + NONSTOP* ) nonstop=true ;; +esac + +CLASSPATH=$APP_HOME/gradle/wrapper/gradle-wrapper.jar + + +# Determine the Java command to use to start the JVM. +if [ -n "$JAVA_HOME" ] ; then + if [ -x "$JAVA_HOME/jre/sh/java" ] ; then + # IBM's JDK on AIX uses strange locations for the executables + JAVACMD=$JAVA_HOME/jre/sh/java + else + JAVACMD=$JAVA_HOME/bin/java + fi + if [ ! -x "$JAVACMD" ] ; then + die "ERROR: JAVA_HOME is set to an invalid directory: $JAVA_HOME + +Please set the JAVA_HOME variable in your environment to match the +location of your Java installation." + fi +else + JAVACMD=java + if ! command -v java >/dev/null 2>&1 + then + die "ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. + +Please set the JAVA_HOME variable in your environment to match the +location of your Java installation." + fi +fi + +# Increase the maximum file descriptors if we can. +if ! "$cygwin" && ! "$darwin" && ! "$nonstop" ; then + case $MAX_FD in #( + max*) + # In POSIX sh, ulimit -H is undefined. That's why the result is checked to see if it worked. 
+ # shellcheck disable=SC2039,SC3045 + MAX_FD=$( ulimit -H -n ) || + warn "Could not query maximum file descriptor limit" + esac + case $MAX_FD in #( + '' | soft) :;; #( + *) + # In POSIX sh, ulimit -n is undefined. That's why the result is checked to see if it worked. + # shellcheck disable=SC2039,SC3045 + ulimit -n "$MAX_FD" || + warn "Could not set maximum file descriptor limit to $MAX_FD" + esac +fi + +# Collect all arguments for the java command, stacking in reverse order: +# * args from the command line +# * the main class name +# * -classpath +# * -D...appname settings +# * --module-path (only if needed) +# * DEFAULT_JVM_OPTS, JAVA_OPTS, and GRADLE_OPTS environment variables. + +# For Cygwin or MSYS, switch paths to Windows format before running java +if "$cygwin" || "$msys" ; then + APP_HOME=$( cygpath --path --mixed "$APP_HOME" ) + CLASSPATH=$( cygpath --path --mixed "$CLASSPATH" ) + + JAVACMD=$( cygpath --unix "$JAVACMD" ) + + # Now convert the arguments - kludge to limit ourselves to /bin/sh + for arg do + if + case $arg in #( + -*) false ;; # don't mess with options #( + /?*) t=${arg#/} t=/${t%%/*} # looks like a POSIX filepath + [ -e "$t" ] ;; #( + *) false ;; + esac + then + arg=$( cygpath --path --ignore --mixed "$arg" ) + fi + # Roll the args list around exactly as many times as the number of + # args, so each arg winds up back in the position where it started, but + # possibly modified. + # + # NB: a `for` loop captures its iteration list before it begins, so + # changing the positional parameters here affects neither the number of + # iterations, nor the values presented in `arg`. + shift # remove old arg + set -- "$@" "$arg" # push replacement arg + done +fi + + +# Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. 
+DEFAULT_JVM_OPTS='"-Xmx64m" "-Xms64m"' + +# Collect all arguments for the java command: +# * DEFAULT_JVM_OPTS, JAVA_OPTS, JAVA_OPTS, and optsEnvironmentVar are not allowed to contain shell fragments, +# and any embedded shellness will be escaped. +# * For example: A user cannot expect ${Hostname} to be expanded, as it is an environment variable and will be +# treated as '${Hostname}' itself on the command line. + +set -- \ + "-Dorg.gradle.appname=$APP_BASE_NAME" \ + -classpath "$CLASSPATH" \ + org.gradle.wrapper.GradleWrapperMain \ + "$@" + +# Stop when "xargs" is not available. +if ! command -v xargs >/dev/null 2>&1 +then + die "xargs is not available" +fi + +# Use "xargs" to parse quoted args. +# +# With -n1 it outputs one arg per line, with the quotes and backslashes removed. +# +# In Bash we could simply go: +# +# readarray ARGS < <( xargs -n1 <<<"$var" ) && +# set -- "${ARGS[@]}" "$@" +# +# but POSIX shell has neither arrays nor command substitution, so instead we +# post-process each arg (as a line of input to sed) to backslash-escape any +# character that might be a shell metacharacter, then use eval to reverse +# that process (while maintaining the separation between arguments), and wrap +# the whole thing up as a single "set" statement. +# +# This will of course break if any of these variables contains a newline or +# an unmatched quote. +# + +eval "set -- $( + printf '%s\n' "$DEFAULT_JVM_OPTS $JAVA_OPTS $GRADLE_OPTS" | + xargs -n1 | + sed ' s~[^-[:alnum:]+,./:=@_]~\\&~g; ' | + tr '\n' ' ' + )" '"$@"' + +exec "$JAVACMD" "$@" diff --git a/gradlew.bat b/gradlew.bat new file mode 100644 index 00000000..93e3f59f --- /dev/null +++ b/gradlew.bat @@ -0,0 +1,92 @@ +@rem +@rem Copyright 2015 the original author or authors. +@rem +@rem Licensed under the Apache License, Version 2.0 (the "License"); +@rem you may not use this file except in compliance with the License. 
+@rem You may obtain a copy of the License at +@rem +@rem https://www.apache.org/licenses/LICENSE-2.0 +@rem +@rem Unless required by applicable law or agreed to in writing, software +@rem distributed under the License is distributed on an "AS IS" BASIS, +@rem WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +@rem See the License for the specific language governing permissions and +@rem limitations under the License. +@rem + +@if "%DEBUG%"=="" @echo off +@rem ########################################################################## +@rem +@rem Gradle startup script for Windows +@rem +@rem ########################################################################## + +@rem Set local scope for the variables with windows NT shell +if "%OS%"=="Windows_NT" setlocal + +set DIRNAME=%~dp0 +if "%DIRNAME%"=="" set DIRNAME=. +@rem This is normally unused +set APP_BASE_NAME=%~n0 +set APP_HOME=%DIRNAME% + +@rem Resolve any "." and ".." in APP_HOME to make it shorter. +for %%i in ("%APP_HOME%") do set APP_HOME=%%~fi + +@rem Add default JVM options here. You can also use JAVA_OPTS and GRADLE_OPTS to pass JVM options to this script. +set DEFAULT_JVM_OPTS="-Xmx64m" "-Xms64m" + +@rem Find java.exe +if defined JAVA_HOME goto findJavaFromJavaHome + +set JAVA_EXE=java.exe +%JAVA_EXE% -version >NUL 2>&1 +if %ERRORLEVEL% equ 0 goto execute + +echo. +echo ERROR: JAVA_HOME is not set and no 'java' command could be found in your PATH. +echo. +echo Please set the JAVA_HOME variable in your environment to match the +echo location of your Java installation. + +goto fail + +:findJavaFromJavaHome +set JAVA_HOME=%JAVA_HOME:"=% +set JAVA_EXE=%JAVA_HOME%/bin/java.exe + +if exist "%JAVA_EXE%" goto execute + +echo. +echo ERROR: JAVA_HOME is set to an invalid directory: %JAVA_HOME% +echo. +echo Please set the JAVA_HOME variable in your environment to match the +echo location of your Java installation. 
+ +goto fail + +:execute +@rem Setup the command line + +set CLASSPATH=%APP_HOME%\gradle\wrapper\gradle-wrapper.jar + + +@rem Execute Gradle +"%JAVA_EXE%" %DEFAULT_JVM_OPTS% %JAVA_OPTS% %GRADLE_OPTS% "-Dorg.gradle.appname=%APP_BASE_NAME%" -classpath "%CLASSPATH%" org.gradle.wrapper.GradleWrapperMain %* + +:end +@rem End local scope for the variables with windows NT shell +if %ERRORLEVEL% equ 0 goto mainEnd + +:fail +rem Set variable GRADLE_EXIT_CONSOLE if you need the _script_ return code instead of +rem the _cmd.exe /c_ return code! +set EXIT_CODE=%ERRORLEVEL% +if %EXIT_CODE% equ 0 set EXIT_CODE=1 +if not ""=="%GRADLE_EXIT_CONSOLE%" exit %EXIT_CODE% +exit /b %EXIT_CODE% + +:mainEnd +if "%OS%"=="Windows_NT" endlocal + +:omega diff --git a/ksoup/build.gradle.kts b/ksoup/build.gradle.kts new file mode 100644 index 00000000..fe6e4ffc --- /dev/null +++ b/ksoup/build.gradle.kts @@ -0,0 +1,86 @@ +plugins { + alias(libs.plugins.kotlinMultiplatform) + alias(libs.plugins.androidLibrary) + id("maven-publish") +} + +group = "com.fleeksoft" +version = "0.0.1" + +kotlin { + jvm() + + androidTarget { + compilations.all { + kotlinOptions { + jvmTarget = "17" + } + } + } + + listOf( + iosX64(), + iosArm64(), + iosSimulatorArm64() + ).forEach { + it.binaries.framework { + baseName = "ksoup" + isStatic = true + } + } + + sourceSets { + commonMain.dependencies { + //put your multiplatform dependencies here + implementation(libs.ktor.core) + + implementation(libs.kotlinx.datetime) +// implementation(libs.kotlinx.io) + implementation(libs.codepoints) + implementation(libs.okio) + } + commonTest.dependencies { + implementation(libs.kotlin.test) + implementation(libs.gson) + } + + jvmTest.dependencies { + implementation(libs.kotlin.test) + } + } +} + +android { + namespace = "com.fleeksoft.ksoup" + compileSdk = 34 + defaultConfig { + minSdk = 21 + } + compileOptions { + sourceCompatibility = JavaVersion.VERSION_17 + targetCompatibility = JavaVersion.VERSION_17 + } +} + +publishing { + + 
publications { + + } + + repositories { + maven { + + } + } +} + +tasks.named("publish").configure { + dependsOn("jvmTest") +// testDebugUnitTest +// iosSimulatorArm64Test +} + +tasks.named("publishToMavenLocal").configure { + dependsOn("iosSimulatorArm64Test") +} \ No newline at end of file diff --git a/ksoup/src/androidMain/kotlin/com/fleeksoft/ksoup/Platform.android.kt b/ksoup/src/androidMain/kotlin/com/fleeksoft/ksoup/Platform.android.kt new file mode 100644 index 00000000..ecf8f4b0 --- /dev/null +++ b/ksoup/src/androidMain/kotlin/com/fleeksoft/ksoup/Platform.android.kt @@ -0,0 +1,17 @@ +package com.fleeksoft.ksoup + +import okio.BufferedSource +import okio.FileSystem +import okio.GzipSource +import okio.Path +import okio.buffer + +actual fun readGzipFile(file: Path): BufferedSource { + val fileSource = FileSystem.SYSTEM.source(file) + return GzipSource(source = fileSource).buffer() +} + + +actual fun readFile(file: Path): BufferedSource { + return FileSystem.SYSTEM.source(file).buffer() +} \ No newline at end of file diff --git a/ksoup/src/androidUnitTest/kotlin/com/fleeksoft/ksoup/Test.android.kt b/ksoup/src/androidUnitTest/kotlin/com/fleeksoft/ksoup/Test.android.kt new file mode 100644 index 00000000..b78615f3 --- /dev/null +++ b/ksoup/src/androidUnitTest/kotlin/com/fleeksoft/ksoup/Test.android.kt @@ -0,0 +1,12 @@ +package com.fleeksoft.ksoup + +import org.junit.Assert.assertTrue +import kotlin.test.Test + +class AndroidGreetingTest { + + @Test + fun testExample() { + + } +} \ No newline at end of file diff --git a/ksoup/src/commonMain/kotlin/com/fleeksoft/ksoup/Ksoup.kt b/ksoup/src/commonMain/kotlin/com/fleeksoft/ksoup/Ksoup.kt new file mode 100644 index 00000000..1608154d --- /dev/null +++ b/ksoup/src/commonMain/kotlin/com/fleeksoft/ksoup/Ksoup.kt @@ -0,0 +1,365 @@ +package com.fleeksoft.ksoup + +import com.fleeksoft.ksoup.helper.DataUtil +import com.fleeksoft.ksoup.helper.NetworkHelper +import com.fleeksoft.ksoup.nodes.Document +import 
com.fleeksoft.ksoup.nodes.Element +import com.fleeksoft.ksoup.parser.Parser +import com.fleeksoft.ksoup.ported.BufferReader +import com.fleeksoft.ksoup.safety.Cleaner +import com.fleeksoft.ksoup.safety.Safelist +import io.ktor.client.statement.* +import kotlinx.coroutines.runBlocking +import okio.IOException +import okio.Path.Companion.toPath + +/** + * The core public access point to the com.fleeksoft.ksoup functionality. + * + * @author Sabeeh + */ +object Ksoup { + /** + * Parse HTML into a Document. The parser will make a sensible, balanced document tree out of any HTML. + * + * @param html HTML to parse + * @param baseUri The URL where the HTML was retrieved from. Used to resolve relative URLs to absolute URLs, that occur + * before the HTML declares a `` tag. + * @return sane HTML + */ + fun parse(html: String, baseUri: String): Document { + return Parser.parse(html, baseUri) + } + + /** + * Parse HTML into a Document, using the provided Parser. You can provide an alternate parser, such as a simple XML + * (non-HTML) parser. + * + * @param html HTML to parse + * @param baseUri The URL where the HTML was retrieved from. Used to resolve relative URLs to absolute URLs, that occur + * before the HTML declares a `` tag. + * @param parser alternate [parser][Parser.xmlParser] to use. + * @return sane HTML + */ + fun parse(html: String, baseUri: String, parser: Parser): Document { + return parser.parseInput(html, baseUri) + } + + /** + * Parse HTML into a Document, using the provided Parser. You can provide an alternate parser, such as a simple XML + * (non-HTML) parser. As no base URI is specified, absolute URL resolution, if required, relies on the HTML including + * a `` tag. + * + * @param html HTML to parse + * before the HTML declares a `` tag. + * @param parser alternate [parser][Parser.xmlParser] to use. 
+ * @return sane HTML + */ + fun parse(html: String, parser: Parser): Document { + return parser.parseInput(html, "") + } + + /** + * Parse HTML into a Document. As no base URI is specified, absolute URL resolution, if required, relies on the HTML + * including a `` tag. + * + * @param html HTML to parse + * @return sane HTML + * @see .parse + */ + fun parse(html: String): Document { + return Parser.parse(html, "") + } + + /** + * Creates a new connection, with the defined request URL. Use to fetch and parse a HTML page. + * + * + * Use examples: + * + * * `Document doc = Jsoup.connect("http://example.com").userAgent("Mozilla").data("name", "com.fleeksoft.ksoup").get();` + * * `Document doc = Jsoup.connect("http://example.com").cookie("auth", "token").post();` + * + * @param url URL to connect to. The protocol must be `http` or `https`. + * @return the connection. You can add data, cookies, and headers; set the user-agent, referrer, method; and then execute. + * + */ + fun connect( + url: String, + headers: Map = mapOf(), + parser: Parser = Parser.htmlParser(), + ): Document { + val result: String = + runBlocking { NetworkHelper.instance.get(url, headers = headers).bodyAsText() } + return parse(result, parser) + } + + /** + * Parse the contents of a file as HTML. + * + * @param file file to load HTML from. Supports gzipped files (ending in .z or .gz). + * @param charsetName (optional) character set of file contents. Set to `null` to determine from `http-equiv` meta tag, if + * present, or fall back to `UTF-8` (which is often safe to do). + * @param baseUri The URL where the HTML was retrieved from, to resolve relative links against. + * @return sane HTML + * @throws IOException if the file could not be found, or read, or if the charsetName is invalid. + */ + @Throws(IOException::class) + fun parseFile(file: String, baseUri: String, charsetName: String? 
= null): Document { + val filePath = file.toPath() + return DataUtil.load(filePath, charsetName, baseUri) + } + + /** + * Parse the contents of a file as HTML. The location of the file is used as the base URI to qualify relative URLs. + * + * @param file file to load HTML from. Supports gzipped files (ending in .z or .gz). + * @param charsetName (optional) character set of file contents. Set to `null` to determine from `http-equiv` meta tag, if + * present, or fall back to `UTF-8` (which is often safe to do). + * @return sane HTML + * @throws IOException if the file could not be found, or read, or if the charsetName is invalid. + * @see .parse + */ + @Throws(IOException::class) + fun parseFile(file: String, charsetName: String? = null): Document { + val filePath = file.toPath() + return DataUtil.load(filePath, charsetName, filePath.toString()) + } + + /** + * Parse the contents of a file as HTML. The location of the file is used as the base URI to qualify relative URLs. + * The charset used to read the file will be determined by the byte-order-mark (BOM), or a `` tag, + * or if neither is present, will be `UTF-8`. + * + * + * This is the equivalent of calling [parse(file, null)][.parse] + * + * @param file the file to load HTML from. Supports gzipped files (ending in .z or .gz). + * @return sane HTML + * @throws IOException if the file could not be found or read. + * @see .parse + * @since 1.15.1 + */ + @Throws(IOException::class) + fun parseFile(file: String): Document { + val filePath = file.toPath() + return DataUtil.load(filePath, null, filePath.toString()) + } + + /** + * Parse the contents of a file as HTML. + * + * @param file file to load HTML from. Supports gzipped files (ending in .z or .gz). + * @param charsetName (optional) character set of file contents. Set to `null` to determine from `http-equiv` meta tag, if + * present, or fall back to `UTF-8` (which is often safe to do). 
+ * @param baseUri The URL where the HTML was retrieved from, to resolve relative links against. + * @param parser alternate [parser][Parser.xmlParser] to use. + * @return sane HTML + * @throws IOException if the file could not be found, or read, or if the charsetName is invalid. + * @since 1.14.2 + */ + @Throws(IOException::class) + fun parseFile( + file: String, + charsetName: String?, + baseUri: String, + parser: Parser, + ): Document { + return DataUtil.load(file.toPath(), charsetName, baseUri, parser) + } + + /** + * Read an input stream, and parse it to a Document. + * + * @param `in` input stream to read. The stream will be closed after reading. + * @param charsetName (optional) character set of file contents. Set to `null` to determine from `http-equiv` meta tag, if + * present, or fall back to `UTF-8` (which is often safe to do). + * @param baseUri The URL where the HTML was retrieved from, to resolve relative links against. + * @return sane HTML + * @throws IOException if the file could not be found, or read, or if the charsetName is invalid. + */ + @Throws(IOException::class) + fun parse( + bufferReader: BufferReader, + charsetName: String?, + baseUri: String, + ): Document { + return DataUtil.load(bufferReader, charsetName, baseUri) + } + + /** + * Read an input stream, and parse it to a Document. You can provide an alternate parser, such as a simple XML + * (non-HTML) parser. + * + * @param `in` input stream to read. Make sure to close it after parsing. + * @param charsetName (optional) character set of file contents. Set to `null` to determine from `http-equiv` meta tag, if + * present, or fall back to `UTF-8` (which is often safe to do). + * @param baseUri The URL where the HTML was retrieved from, to resolve relative links against. + * @param parser alternate [parser][Parser.xmlParser] to use. + * @return sane HTML + * @throws IOException if the file could not be found, or read, or if the charsetName is invalid. 
+ */ + @Throws(IOException::class) + fun parse( + bufferReader: BufferReader, + charsetName: String?, + baseUri: String, + parser: Parser, + ): Document { + return DataUtil.load(bufferReader, charsetName, baseUri, parser) + } + + /** + * Parse a fragment of HTML, with the assumption that it forms the `body` of the HTML. + * + * @param bodyHtml body HTML fragment + * @param baseUri URL to resolve relative URLs against. + * @return sane HTML document + * @see Document.body + */ + fun parseBodyFragment(bodyHtml: String, baseUri: String?): Document { + return Parser.parseBodyFragment(bodyHtml, baseUri) + } + + /** + * Parse a fragment of HTML, with the assumption that it forms the `body` of the HTML. + * + * @param bodyHtml body HTML fragment + * @return sane HTML document + * @see Document.body + */ + fun parseBodyFragment(bodyHtml: String): Document { + return Parser.parseBodyFragment(bodyHtml, "") + } + + /** + * Fetch a URL, and parse it as HTML. Provided for compatibility; in most cases use [.connect] instead. + * + * + * The encoding character set is determined by the content-type header or http-equiv meta tag, or falls back to `UTF-8`. + * + * @param url URL to fetch (with a GET). The protocol must be `http` or `https`. + * @param timeoutMillis Connection and read timeout, in milliseconds. If exceeded, IOException is thrown. + * @return The parsed HTML. 
+ * @throws java.net.MalformedURLException if the request URL is not a HTTP or HTTPS URL, or is otherwise malformed + * @throws HttpStatusException if the response is not OK and HTTP response errors are not ignored + * @throws UnsupportedMimeTypeException if the response mime type is not supported and those errors are not ignored + * @throws java.net.SocketTimeoutException if the connection times out + * @throws IOException if a connection or read error occurs + * @see .connect + */ + /*@Throws(IOException::class) + fun parse(url: java.net.URL?, timeoutMillis: Int): Document { + val con: Connection = HttpConnection.connect(url) + con.timeout(timeoutMillis) + return con.get() + }*/ + + /** + * Get safe HTML from untrusted input HTML, by parsing input HTML and filtering it through an allow-list of safe + * tags and attributes. + * + * @param bodyHtml input untrusted HTML (body fragment) + * @param baseUri URL to resolve relative URLs against + * @param safelist list of permitted HTML elements + * @return safe HTML (body fragment) + * @see Cleaner.clean + */ + fun clean(bodyHtml: String, baseUri: String?, safelist: Safelist): String { + val dirty: Document = parseBodyFragment(bodyHtml, baseUri) + val cleaner = Cleaner(safelist) + val clean: Document = cleaner.clean(dirty) + return clean.body().html() + } + + /** + * Get safe HTML from untrusted input HTML, by parsing input HTML and filtering it through a safe-list of permitted + * tags and attributes. + * + * + * Note that as this method does not take a base href URL to resolve attributes with relative URLs against, those + * URLs will be removed, unless the input HTML contains a ` tag`. If you wish to preserve those, use + * the [Ksoup.clean] method instead, and enable + * [Safelist.preserveRelativeLinks]. + * + * + * Note that the output of this method is still **HTML** even when using the TextNode only + * [Safelist.none], and so any HTML entities in the output will be appropriately escaped. 
+ * If you want plain text, not HTML, you should use a text method such as [Element.text] instead, after + * cleaning the document. + * + * Example: + *
`String sourceBodyHtml = "

5 is < 6.

"; + * String html = Jsoup.clean(sourceBodyHtml, Safelist.none()); + * + * Cleaner cleaner = new Cleaner(Safelist.none()); + * String text = cleaner.clean(Jsoup.parse(sourceBodyHtml)).text(); + * + * // html is: 5 is < 6. + * // text is: 5 is < 6. + `
* + * + * @param bodyHtml input untrusted HTML (body fragment) + * @param safelist list of permitted HTML elements + * @return safe HTML (body fragment) + * @see Cleaner.clean + */ + fun clean(bodyHtml: String, safelist: Safelist): String { + return clean(bodyHtml, "", safelist) + } + + /** + * Get safe HTML from untrusted input HTML, by parsing input HTML and filtering it through a safe-list of + * permitted tags and attributes. + * + * The HTML is treated as a body fragment; it's expected the cleaned HTML will be used within the body of an + * existing document. If you want to clean full documents, use [Cleaner.clean] instead, and add + * structural tags (`html, head, body` etc) to the safelist. + * + * @param bodyHtml input untrusted HTML (body fragment) + * @param baseUri URL to resolve relative URLs against + * @param safelist list of permitted HTML elements + * @param outputSettings document output settings; use to control pretty-printing and entity escape modes + * @return safe HTML (body fragment) + * @see Cleaner.clean + */ + fun clean( + bodyHtml: String, + baseUri: String?, + safelist: Safelist, + outputSettings: Document.OutputSettings, + ): String { + val dirty: Document = parseBodyFragment(bodyHtml, baseUri) + val cleaner = Cleaner(safelist) + val clean: Document = cleaner.clean(dirty) + clean.outputSettings(outputSettings) + return clean.body().html() + } + + /** + * Test if the input body HTML has only tags and attributes allowed by the Safelist. Useful for form validation. + * + * + * This method is intended to be used in a user interface as a validator for user input. Note that regardless of the + * output of this method, the input document **must always** be normalized using a method such as + * [.clean], and the result of that method used to store or serialize the document + * before later reuse such as presentation to end users. 
This ensures that enforced attributes are set correctly, and + * that any differences between how a given browser and how com.fleeksoft.ksoup parses the input HTML are normalized. + * + * + * Example: + *
`Safelist safelist = Safelist.relaxed();
+     * boolean isValid = Jsoup.isValid(sourceBodyHtml, safelist);
+     * String normalizedHtml = Jsoup.clean(sourceBodyHtml, "https://example.com/", safelist);
+    `
* + * + * Assumes the HTML is a body fragment (i.e. will be used in an existing HTML document body.) + * @param bodyHtml HTML to test + * @param safelist safelist to test against + * @return true if no tags or attributes were removed; false otherwise + * @see .clean + */ + fun isValid(bodyHtml: String, safelist: Safelist): Boolean { + return Cleaner(safelist).isValidBodyHtml(bodyHtml) + } +} diff --git a/ksoup/src/commonMain/kotlin/com/fleeksoft/ksoup/Platform.kt b/ksoup/src/commonMain/kotlin/com/fleeksoft/ksoup/Platform.kt new file mode 100644 index 00000000..42ba0509 --- /dev/null +++ b/ksoup/src/commonMain/kotlin/com/fleeksoft/ksoup/Platform.kt @@ -0,0 +1,9 @@ +package com.fleeksoft.ksoup + +import okio.BufferedSource +import okio.Path + +expect fun readGzipFile(file: Path): BufferedSource + + +expect fun readFile(file: Path): BufferedSource \ No newline at end of file diff --git a/ksoup/src/commonMain/kotlin/com/fleeksoft/ksoup/SerializationException.kt b/ksoup/src/commonMain/kotlin/com/fleeksoft/ksoup/SerializationException.kt new file mode 100644 index 00000000..c2033c91 --- /dev/null +++ b/ksoup/src/commonMain/kotlin/com/fleeksoft/ksoup/SerializationException.kt @@ -0,0 +1,40 @@ +package com.fleeksoft.ksoup + +/** + * A SerializationException is raised whenever serialization of a DOM element fails. This exception usually wraps an + * [IOException] that may be thrown due to an inaccessible output stream. + */ +class SerializationException : RuntimeException { + /** + * Creates and initializes a new serialization exception with no error message and cause. + */ + constructor() : super() + + /** + * Creates and initializes a new serialization exception with the given error message and no cause. + * + * @param message + * the error message of the new serialization exception (may be `null`). + */ + constructor(message: String?) 
: super(message) + + /** + * Creates and initializes a new serialization exception with the specified cause and an error message of + * `(cause==null ? null : cause.toString())` (which typically contains the class and error message of + * `cause`). + * + * @param cause + * the cause of the new serialization exception (may be `null`). + */ + constructor(cause: Throwable?) : super(cause) + + /** + * Creates and initializes a new serialization exception with the given error message and cause. + * + * @param message + * the error message of the new serialization exception. + * @param cause + * the cause of the new serialization exception. + */ + constructor(message: String?, cause: Throwable?) : super(message, cause) +} \ No newline at end of file diff --git a/ksoup/src/commonMain/kotlin/com/fleeksoft/ksoup/UncheckedIOException.kt b/ksoup/src/commonMain/kotlin/com/fleeksoft/ksoup/UncheckedIOException.kt new file mode 100644 index 00000000..b8c58b65 --- /dev/null +++ b/ksoup/src/commonMain/kotlin/com/fleeksoft/ksoup/UncheckedIOException.kt @@ -0,0 +1,12 @@ +package com.fleeksoft.ksoup + +import okio.IOException + +class UncheckedIOException : Exception { + constructor(cause: IOException?) : super(cause) + constructor(message: String?) : super(IOException(message)) + + fun ioException(): Throwable? { + return cause + } +} diff --git a/ksoup/src/commonMain/kotlin/com/fleeksoft/ksoup/UnsupportedMimeTypeException.kt b/ksoup/src/commonMain/kotlin/com/fleeksoft/ksoup/UnsupportedMimeTypeException.kt new file mode 100644 index 00000000..9f58bcba --- /dev/null +++ b/ksoup/src/commonMain/kotlin/com/fleeksoft/ksoup/UnsupportedMimeTypeException.kt @@ -0,0 +1,14 @@ +package com.fleeksoft.ksoup + +import okio.IOException + +/** + * Signals that a HTTP response returned a mime type that is not supported. 
+ */
+class UnsupportedMimeTypeException(message: String?, val mimeType: String, val url: String) :
+    IOException(message) {
+
+    override fun toString(): String {
+        return "${super.toString()}. Mimetype=$mimeType, URL=$url"
+    }
+}
diff --git a/ksoup/src/commonMain/kotlin/com/fleeksoft/ksoup/helper/ChangeNotifyingArrayList.kt b/ksoup/src/commonMain/kotlin/com/fleeksoft/ksoup/helper/ChangeNotifyingArrayList.kt
new file mode 100644
index 00000000..5f604d99
--- /dev/null
+++ b/ksoup/src/commonMain/kotlin/com/fleeksoft/ksoup/helper/ChangeNotifyingArrayList.kt
@@ -0,0 +1,88 @@
+package com.fleeksoft.ksoup.helper
+
+import com.fleeksoft.ksoup.removeRange
+
+/**
+ * A [MutableList] backed by an [ArrayList] that calls [onContentsChanged] before each mutation made through
+ * the list's own mutating methods.
+ *
+ * Mutations made through the objects returned by [iterator], [listIterator] or [subList] are applied to the
+ * backing list directly and do not trigger [onContentsChanged].
+ */
+abstract class ChangeNotifyingArrayList<E>(initialCapacity: Int) : MutableList<E> {
+
+    // Declared as ArrayList so removeRange() can be called without an unchecked cast.
+    private val delegate: ArrayList<E> = ArrayList(initialCapacity)
+
+    /** Called immediately before the backing list is modified. */
+    abstract fun onContentsChanged()
+
+    override fun set(index: Int, element: E): E {
+        onContentsChanged()
+        return delegate.set(index, element)
+    }
+
+    override fun add(element: E): Boolean {
+        onContentsChanged()
+        return delegate.add(element)
+    }
+
+    override fun add(index: Int, element: E) {
+        onContentsChanged()
+        delegate.add(index, element)
+    }
+
+    override fun removeAt(index: Int): E {
+        onContentsChanged()
+        return delegate.removeAt(index)
+    }
+
+    override fun remove(element: E): Boolean {
+        onContentsChanged()
+        return delegate.remove(element)
+    }
+
+    override fun clear() {
+        onContentsChanged()
+        delegate.clear()
+    }
+
+    override fun addAll(elements: Collection<E>): Boolean {
+        onContentsChanged()
+        return delegate.addAll(elements)
+    }
+
+    override fun addAll(index: Int, elements: Collection<E>): Boolean {
+        onContentsChanged()
+        return delegate.addAll(index, elements)
+    }
+
+    /** Notifies, then forwards the index range to the imported `removeRange` extension on the backing list. */
+    fun removeRange(fromIndex: Int, toIndex: Int) {
+        onContentsChanged()
+        delegate.removeRange(fromIndex, toIndex)
+    }
+
+    override fun removeAll(elements: Collection<E>): Boolean {
+        onContentsChanged()
+        return delegate.removeAll(elements)
+    }
+
+    override fun retainAll(elements: Collection<E>): Boolean {
+        onContentsChanged()
+        return delegate.retainAll(elements)
+    }
+
+    override val size: Int get() = delegate.size
+    override fun contains(element: E) = delegate.contains(element)
+    override fun containsAll(elements: Collection<E>) = delegate.containsAll(elements)
+    override fun get(index: Int): E = delegate[index]
+    override fun indexOf(element: E): Int = delegate.indexOf(element)
+    override fun isEmpty(): Boolean = delegate.isEmpty()
+    override fun iterator(): MutableIterator<E> = delegate.iterator()
+    override fun lastIndexOf(element: E): Int = delegate.lastIndexOf(element)
+    override fun listIterator(): MutableListIterator<E> = delegate.listIterator()
+    override fun listIterator(index: Int): MutableListIterator<E> = delegate.listIterator(index)
+    override fun subList(fromIndex: Int, toIndex: Int): MutableList<E> =
+        delegate.subList(fromIndex, toIndex)
+}
diff --git a/ksoup/src/commonMain/kotlin/com/fleeksoft/ksoup/helper/DataUtil.kt b/ksoup/src/commonMain/kotlin/com/fleeksoft/ksoup/helper/DataUtil.kt
new file mode 100644
index 00000000..2330d20a
--- /dev/null
+++ b/ksoup/src/commonMain/kotlin/com/fleeksoft/ksoup/helper/DataUtil.kt
@@ -0,0 +1,415 @@
+package com.fleeksoft.ksoup.helper
+
+import com.fleeksoft.ksoup.UncheckedIOException
+import com.fleeksoft.ksoup.internal.ConstrainableSource
+import com.fleeksoft.ksoup.internal.Normalizer
+import com.fleeksoft.ksoup.internal.StringUtil
+import com.fleeksoft.ksoup.nodes.Comment
+import com.fleeksoft.ksoup.nodes.Document
+import com.fleeksoft.ksoup.nodes.Node
+import com.fleeksoft.ksoup.nodes.XmlDeclaration
+import com.fleeksoft.ksoup.parser.Parser
+import com.fleeksoft.ksoup.ported.BufferReader
+import com.fleeksoft.ksoup.ported.IllegalCharsetNameException
+import com.fleeksoft.ksoup.ported.canEncode
+import com.fleeksoft.ksoup.ported.isCharsetSupported
+import com.fleeksoft.ksoup.readFile
+import
com.fleeksoft.ksoup.readGzipFile +import com.fleeksoft.ksoup.select.Elements +import io.ktor.utils.io.charsets.* +import io.ktor.utils.io.core.* +import okio.IOException +import okio.Path +import okio.use +import kotlin.math.min +import kotlin.random.Random + +/** + * Internal static utilities for handling data. + * + */ +object DataUtil { + private val charsetPattern: Regex = + Regex("(?i)\\bcharset=\\s*(?:[\"'])?([^\\s,;\"']*)") + val UTF_8: Charset = + Charsets.UTF_8 // Don't use StandardCharsets, as those only appear in Android API 19, and we target 10. + val defaultCharsetName: String = UTF_8.name // used if not found in header or meta charset + private const val firstReadBufferSize = 1024 * 5 + private const val bufferSize: Long = (1024 * 32).toLong() + private val mimeBoundaryChars = + "-_1234567890abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ".toCharArray() + const val boundaryLength = 32 + + /** + * Loads and parses a file to a Document, with the HtmlParser. Files that are compressed with gzip (and end in `.gz` or `.z`) + * are supported in addition to uncompressed files. + * + * @param path file to load + * @param charsetName (optional) character set of input; specify `null` to attempt to autodetect. A BOM in + * the file will always override this setting. + * @param baseUri base URI of document, to resolve relative links against + * @return Document + * @throws IOException on IO error + */ + @Throws(IOException::class) + fun load(path: Path, charsetName: String?, baseUri: String): Document { + return load(path, charsetName, baseUri, Parser.htmlParser()) + } + + /** + * Loads and parses a file to a Document. Files that are compressed with gzip (and end in `.gz` or `.z`) + * are supported in addition to uncompressed files. + * + * @param filePath file to load + * @param charsetName (optional) character set of input; specify `null` to attempt to autodetect. A BOM in + * the file will always override this setting. 
+ * @param baseUri base URI of document, to resolve relative links against + * @param parser alternate [parser][Parser.xmlParser] to use. + * + * @return Document + * @throws IOException on IO error + * @since 1.14.2 + */ + @Throws(IOException::class) + fun load( + filePath: Path, + charsetName: String?, + baseUri: String, + parser: Parser, + ): Document { + val name: String = Normalizer.lowerCase(filePath.name) +// todo:// handle gzip source + + val source = readFile(filePath) + return source.use { bufferedSource -> + val bufferReader: BufferReader = if (name.endsWith(".gz") || name.endsWith(".z")) { + val zipped: Boolean = runCatching { + bufferedSource.peek().use { peekSource -> +// In Kotlin, a Byte is signed and ranges from -128 to 127. In contrast, in Java, a byte is an unsigned type and ranges from 0 to 255. +// in kotlin use readUByte to get unsigned byte + peekSource.readByte().toUByte().toInt() == 0x1f && peekSource.readByte() + .toUByte().toInt() == 0x8b // gzip magic bytes 0x1f == 31 & 0x8b = 139 + } + }.getOrNull() ?: false + + if (zipped) { + BufferReader(readGzipFile(filePath).readByteArray()) + /*BufferReader( + GzipSource(Buffer().apply { write(bufferedSource.readByteArray()) }).buffer() + .readByteArray() + )*/ + } else { + BufferReader(bufferedSource.readByteArray()) + } + + } else { + BufferReader(bufferedSource.readByteArray()) + } + +// val charset = charsetName?.let { Charset.forName(it) } ?: Charsets.UTF_8 +// val inputData = bufferedSource.readString() + parseInputSource( + bufferReader, + charsetName, + baseUri, + parser, + ) // Assuming there's a method called parseInputString + } + } + + /** + * Parses a Document from an input steam. + * @param `in` input stream to parse. The stream will be closed after reading. 
+ * @param charsetName character set of input (optional) + * @param baseUri base URI of document, to resolve relative links against + * @return Document + * @throws IOException on IO error + */ + @Throws(IOException::class) + fun load( + bufferReader: BufferReader, + charsetName: String?, + baseUri: String, + ): Document { + return parseInputSource(bufferReader, charsetName, baseUri, Parser.htmlParser()) + } + + /** + * Parses a Document from an input steam, using the provided Parser. + * @param `in` input stream to parse. The stream will be closed after reading. + * @param charsetName character set of input (optional) + * @param baseUri base URI of document, to resolve relative links against + * @param parser alternate [parser][Parser.xmlParser] to use. + * @return Document + * @throws IOException on IO error + */ + @Throws(IOException::class) + fun load( + bufferReader: BufferReader, + charsetName: String?, + baseUri: String, + parser: Parser, + ): Document { + return parseInputSource(bufferReader, charsetName, baseUri, parser) + } + + /** + * Writes the input stream to the output stream. Doesn't close them. + * @param `in` input stream to read from + * @param outSource output stream to write to + * @throws IOException on IO error + */ + @Throws(IOException::class) + fun crossStreams(source: ByteArray, outSource: okio.Buffer) { + outSource.write(source) + } + + @Throws(IOException::class) + fun parseInputSource( + bufferReader: BufferReader?, + charsetNameIn: String?, + baseUri: String, + parser: Parser, + ): Document { + if (bufferReader == null) { + // empty body + return Document(baseUri) + } + var charsetName: String? = charsetNameIn + + val inputReader = ConstrainableSource.wrap(bufferReader, 0) + + /*@Nullable */ + var doc: Document? = null + + // read the start of the stream and look for a BOM or meta charset + + inputReader.mark(bufferSize.toInt()) + // -1 because we read one more to see if completed. First read is < buffer size, so can't be invalid. 
+ val firstBytes: BufferReader = readToByteBuffer(inputReader, firstReadBufferSize - 1) + val fullyRead = inputReader.fullyRead() + inputReader.reset() + + // look for BOM - overrides any other header or input + val bomCharset = detectCharsetFromBom(firstBytes) + if (bomCharset != null) charsetName = bomCharset.charset + if (charsetName == null) { // determine from meta. safe first parse as UTF-8 + doc = try { + /*val defaultDecoded: java.nio.CharBuffer = UTF_8.decode(firstBytes) + if (defaultDecoded.hasArray()) { + parser.parseInput( + java.io.CharArrayReader( + defaultDecoded.array(), + defaultDecoded.arrayOffset(), + defaultDecoded.limit(), + ), + baseUri, + ) + } else { + parser.parseInput(defaultDecoded.toString(), baseUri) + }*/ + parser.parseInput(firstBytes, baseUri) + } catch (e: UncheckedIOException) { + throw e + } + + // look for or HTML5 + val metaElements: Elements = + doc!!.select("meta[http-equiv=content-type], meta[charset]") + var foundCharset: String? = null // if not found, will keep utf-8 as best attempt + for (meta in metaElements) { + if (meta.hasAttr("http-equiv")) { + foundCharset = + getCharsetFromContentType(meta.attr("content")) + } + if (foundCharset == null && meta.hasAttr("charset")) { + foundCharset = + meta.attr("charset") + } + if (foundCharset != null) break + } + + // look for + if (foundCharset == null && doc.childNodeSize() > 0) { + val first: Node = doc.childNode(0) + var decl: XmlDeclaration? = null + if (first is XmlDeclaration) { + decl = first + } else if (first is Comment) { + val comment: Comment = first + if (comment.isXmlDeclaration()) decl = comment.asXmlDeclaration() + } + if (decl != null) { + if (decl.name().equals("xml", ignoreCase = true)) { + foundCharset = + decl.attr("encoding") + } + } + } + foundCharset = validateCharset(foundCharset) + if (foundCharset != null && !foundCharset.equals( + defaultCharsetName, + ignoreCase = true, + ) + ) { // need to re-decode. 
(case insensitive check here to match how validate works) + foundCharset = foundCharset.trim { it <= ' ' }.replace("[\"']".toRegex(), "") + charsetName = foundCharset + doc = null + } else if (!fullyRead) { + doc = null + } + } else { // specified by content type header (or by user on file load) + Validate.notEmpty( + charsetName, + "Must set charset arg to character set of file to parse. Set to null to attempt to detect from HTML", + ) + } + if (doc == null) { + if (charsetName == null) charsetName = defaultCharsetName + // TODO: bufferSize not used here because not supported yet + val reader = BufferReader( + String( + inputReader.readByteArray(), + charset = Charset.forName(charsetName) + ) + ) + if (bomCharset != null && bomCharset.offset) { // creating the buffered inputReader ignores the input pos, so must skip here +// skip first char which can be 2-4 + reader.skipFirstUnicodeChar(1) + } + doc = try { + parser.parseInput(reader, baseUri) + } catch (e: UncheckedIOException) { + // io exception when parsing (not seen before because reading the stream as we go) + throw e + } + val charset: Charset = + if (charsetName == defaultCharsetName) { + UTF_8 + } else { + Charset.forName( + charsetName, + ) + } + doc!!.outputSettings().charset(charset) + if (!charset.canEncode()) { + // some charsets can read but not encode; switch to an encodable charset and update the meta el + doc.charset(UTF_8) + } + } + + return doc + } + + /** + * Read the input stream into a byte buffer. To deal with slow input streams, you may interrupt the thread this + * method is executing on. The data read until being interrupted will be available. + * @param source the input stream to read from + * @param maxSize the maximum size in bytes to read from the stream. Set to 0 to be unlimited. + * @return the filled byte buffer + * @throws IOException if an exception occurs whilst reading from the input stream. 
+ */ + @Throws(IOException::class) + fun readToByteBuffer(bufferReader: BufferReader, maxSize: Int): BufferReader { + Validate.isTrue(maxSize >= 0, "maxSize must be 0 (unlimited) or larger") + val input: ConstrainableSource = + ConstrainableSource.wrap( + bufferReader = bufferReader, + maxSize = min(maxSize.toLong(), bufferReader.getActiveBuffer().size).toInt() + ) + return input.readToByteBuffer(maxSize) + } + + fun emptyByteBuffer(): BufferReader { + return BufferReader() + } + + /** + * Parse out a charset from a content type header. If the charset is not supported, returns null (so the default + * will kick in.) + * @param contentType e.g. "text/html; charset=EUC-JP" + * @return "EUC-JP", or null if not found. Charset is trimmed and uppercased. + */ +// @Nullable + fun getCharsetFromContentType(/*@Nullable */contentType: String?): String? { + if (contentType == null) return null + val matchResult: MatchResult? = charsetPattern.find(contentType) + matchResult?.let { + var charset: String = it.groupValues[1].trim { it <= ' ' } + charset = charset.replace("charset=", "") + return validateCharset(charset) + } + return null + } + + // @Nullable + private fun validateCharset(/*@Nullable*/ cs: String?): String? { + var cs = cs + if (cs.isNullOrEmpty()) return null + cs = cs.trim { it <= ' ' }.replace("[\"']".toRegex(), "") + try { + if (cs.isCharsetSupported()) return cs + cs = cs.uppercase() + if (cs.isCharsetSupported()) return cs + } catch (e: IllegalCharsetNameException) { + // if our this charset matching fails.... we just take the default + } + return null + } + + /** + * Creates a random string, suitable for use as a mime boundary + */ + fun mimeBoundary(): String { + val mime: StringBuilder = StringUtil.borrowBuilder() + for (i in 0 until boundaryLength) { + mime.append(mimeBoundaryChars[Random.nextInt(mimeBoundaryChars.size)]) + } + return StringUtil.releaseBuilder(mime) + } + + // @Nullable + /*private fun detectCharsetFromBom(reader: Reader): BomCharset? 
{ + val snapshot = okio.Buffer() + reader.copyTo(snapshot, 0, min(4, reader.size)) + val bom = snapshot.readByteArray() + if (bom[0].toInt() == 0x00 && bom[1].toInt() == 0x00 && bom[2] == 0xFE.toByte() && bom[3] == 0xFF.toByte() || // BE + bom[0] == 0xFF.toByte() && bom[1] == 0xFE.toByte() && bom[2].toInt() == 0x00 && bom[3].toInt() == 0x00 + ) { // LE + return BomCharset("UTF-32", false) // and I hope it's on your system + } else if (bom[0] == 0xFE.toByte() && bom[1] == 0xFF.toByte() || // BE + bom[0] == 0xFF.toByte() && bom[1] == 0xFE.toByte() + ) { + return BomCharset("UTF-16", false) // in all Javas + } else if (bom[0] == 0xEF.toByte() && bom[1] == 0xBB.toByte() && bom[2] == 0xBF.toByte()) { + return BomCharset("UTF-8", true) // in all Javas + // 16 and 32 decoders consume the BOM to determine be/le; utf-8 should be consumed here + } + return null + }*/ + + private fun detectCharsetFromBom(buffer: BufferReader): BomCharset? { + // .mark and rewind used to return Buffer, now ByteBuffer, so cast for backward compat + buffer.mark() + val bom = ByteArray(4) + if (buffer.remaining() >= bom.size) { + buffer[bom] + buffer.rewind() + } + if (bom[0].toInt() == 0x00 && bom[1].toInt() == 0x00 && bom[2] == 0xFE.toByte() && bom[3] == 0xFF.toByte() || // BE + bom[0] == 0xFF.toByte() && bom[1] == 0xFE.toByte() && bom[2].toInt() == 0x00 && bom[3].toInt() == 0x00 + ) { // LE + return BomCharset("UTF-32", false) // and I hope it's on your system + } else if (bom[0] == 0xFE.toByte() && bom[1] == 0xFF.toByte() || // BE + bom[0] == 0xFF.toByte() && bom[1] == 0xFE.toByte() + ) { + return BomCharset("UTF-16", false) // in all Javas + } else if (bom[0] == 0xEF.toByte() && bom[1] == 0xBB.toByte() && bom[2] == 0xBF.toByte()) { + return BomCharset("UTF-8", true) // in all Javas + // 16 and 32 decoders consume the BOM to determine be/le; utf-8 should be consumed here + } + return null + } + + private class BomCharset(val charset: String, val offset: Boolean) +} diff --git 
a/ksoup/src/commonMain/kotlin/com/fleeksoft/ksoup/helper/NetworkHelper.kt b/ksoup/src/commonMain/kotlin/com/fleeksoft/ksoup/helper/NetworkHelper.kt
new file mode 100644
index 00000000..eb6bb6a0
--- /dev/null
+++ b/ksoup/src/commonMain/kotlin/com/fleeksoft/ksoup/helper/NetworkHelper.kt
@@ -0,0 +1,56 @@
+package com.fleeksoft.ksoup.helper
+
+import io.ktor.client.HttpClient
+import io.ktor.client.request.forms.submitForm
+import io.ktor.client.request.get
+import io.ktor.client.request.headers
+import io.ktor.client.statement.HttpResponse
+import io.ktor.http.parameters
+
+/**
+ * Thin wrapper around a Ktor [HttpClient] for GET requests and form submissions.
+ */
+internal class NetworkHelper(private val client: HttpClient) {
+
+    /**
+     * Issues a GET request to [url], appending every entry of [headers] as a request header.
+     */
+    suspend fun get(
+        url: String,
+        headers: Map<String, String> = emptyMap(),
+    ): HttpResponse {
+        return client.get(url) {
+            // The `headers { }` call is the imported Ktor builder; the inner `headers` is this function's parameter.
+            headers {
+                headers.forEach { (key, value) ->
+                    append(key, value)
+                }
+            }
+        }
+    }
+
+    /**
+     * Submits [params] to [url] as form parameters, appending every entry of [headers] as a request header.
+     */
+    suspend fun post(
+        url: String,
+        params: Map<String, String>,
+        headers: Map<String, String> = emptyMap(),
+    ): HttpResponse {
+        return client.submitForm(url = url, formParameters = parameters {
+            params.forEach { (key, value) ->
+                append(key, value)
+            }
+        }) {
+            headers {
+                headers.forEach { (key, value) ->
+                    append(key, value)
+                }
+            }
+        }
+    }
+
+    companion object {
+        val instance: NetworkHelper = NetworkHelper(HttpClient { })
+    }
+}
\ No newline at end of file
diff --git a/ksoup/src/commonMain/kotlin/com/fleeksoft/ksoup/helper/Validate.kt b/ksoup/src/commonMain/kotlin/com/fleeksoft/ksoup/helper/Validate.kt
new file mode 100644
index 00000000..e3f8c4ae
--- /dev/null
+++ b/ksoup/src/commonMain/kotlin/com/fleeksoft/ksoup/helper/Validate.kt
@@ -0,0 +1,186 @@
+package com.fleeksoft.ksoup.helper
+
+import kotlin.jvm.JvmOverloads
+
+/**
+ * Validators to check that method arguments meet expectations.
+ */
+object Validate {
+    /**
+     * Validates that the object is not null
+     * @param obj object to test
+     * @throws ValidationException if the object is null
+     */
+    @Throws(ValidationException::class)
+    fun notNull(obj: Any?) {
+        if (obj == null) {
+            throw ValidationException(msg = "Object must not be null")
+        }
+    }
+
+    /**
+     * Validates that the parameter is not null
+     *
+     * @param obj the parameter to test
+     * @param param the name of the parameter, for presentation in the validation exception.
+     * @throws ValidationException if the object is null
+     */
+    @Throws(ValidationException::class)
+    fun notNullParam(obj: Any?, param: String?) {
+        if (obj == null) {
+            throw ValidationException("The parameter '$param' must not be null.")
+        }
+    }
+
+    /**
+     * Validates that the object is not null
+     * @param obj object to test
+     * @param msg message to include in the Exception if validation fails
+     * @throws ValidationException if the object is null
+     */
+    fun notNull(obj: Any?, msg: String?) {
+        if (obj == null) throw ValidationException(msg)
+    }
+
+    /**
+     * Verifies the input object is not null, and returns that object. Effectively this casts a nullable object to a non-
+     * null object. (Works around lack of Objects.requireNonNull in Android version.)
+     * @param obj nullable object to cast to not-null
+     * @return the object, or throws an exception if it is null
+     * @throws ValidationException if the object is null
+     */
+    fun ensureNotNull(obj: Any?): Any {
+        return obj ?: throw ValidationException("Object must not be null")
+    }
+
+    /**
+     * Verifies the input object is not null, and returns that object. Effectively this casts a nullable object to a non-
+     * null object. (Works around lack of Objects.requireNonNull in Android version.)
+     * @param obj nullable object to cast to not-null
+     * @param msg the message to include in the validation exception when thrown; it is passed on as-is
+     * (this overload takes no format arguments)
+     * @return the object, or throws an exception if it is null
+     * @throws ValidationException if the object is null
+     */
+    fun ensureNotNull(obj: Any?, msg: String?): Any {
+        return obj ?: throw ValidationException(msg)
+    }
+
+    /**
+     * Validates that the value is true
+     * @param value the condition to test
+     * @throws ValidationException if the value is not true
+     */
+    fun isTrue(value: Boolean) {
+        if (!value) throw ValidationException("Must be true")
+    }
+
+    /**
+     * Validates that the value is true
+     * @param value the condition to test
+     * @param msg message to include in the Exception if validation fails
+     * @throws ValidationException if the value is not true
+     */
+    fun isTrue(value: Boolean, msg: String?) {
+        if (!value) throw ValidationException(msg)
+    }
+
+    /**
+     * Validates that the value is false
+     * @param value the condition to test
+     * @throws ValidationException if the value is not false
+     */
+    fun isFalse(value: Boolean) {
+        if (value) throw ValidationException("Must be false")
+    }
+
+    /**
+     * Validates that the value is false
+     * @param value the condition to test
+     * @param msg message to include in the Exception if validation fails
+     * @throws ValidationException if the value is not false
+     */
+    fun isFalse(value: Boolean, msg: String?) {
+        if (value) throw ValidationException(msg)
+    }
+
+
+    /**
+     * Validates that the array contains no null elements.
+     *
+     * Because of `@JvmOverloads`, Java callers also get a single-argument overload that uses the
+     * default message.
+     *
+     * @param objects the array to test
+     * @param msg message to include in the Exception if validation fails; defaults to
+     * "Array must not contain any null objects"
+     * @throws ValidationException if the array contains a null element
+     */
+    @JvmOverloads
+    fun noNullElements(
+        objects: Array<Any?>,
+        msg: String? = "Array must not contain any null objects",
+    ) {
+        for (obj in objects) if (obj == null) throw ValidationException(msg)
+    }
+
+    /**
+     * Validates that the string is not null and is not empty
+     * @param string the string to test
+     * @throws ValidationException if the string is null or empty
+     */
+    fun notEmpty(string: String?) {
+        if (string.isNullOrEmpty()) throw ValidationException("String must not be empty")
+    }
+
+    /**
+     * Validates that the string parameter is not null and is not empty
+     * @param string the string to test
+     * @param param the name of the parameter, for presentation in the validation exception.
+     * @throws ValidationException if the string is null or empty
+     */
+    fun notEmptyParam(string: String?, param: String?) {
+        if (string.isNullOrEmpty()) {
+            throw ValidationException("The $param parameter must not be empty.")
+        }
+    }
+
+    /**
+     * Validates that the string is not null and is not empty
+     * @param string the string to test
+     * @param msg message to include in the Exception if validation fails
+     * @throws ValidationException if the string is null or empty
+     */
+    fun notEmpty(string: String?, msg: String?) {
+        if (string.isNullOrEmpty()) throw ValidationException(msg)
+    }
+
+    /**
+     * Blow up if we reach an unexpected state.
+     * @param msg message to think about
+     * @throws IllegalStateException if we reach this state
+     */
+    fun wtf(msg: String?) {
+        throw IllegalStateException(msg)
+    }
+
+    /**
+     * Cause a failure.
+     * @param msg message to output.
+     * @throws ValidationException always, carrying [msg]
+     */
+    fun fail(msg: String?) {
+        throw ValidationException(msg)
+    }
+
+    /**
+     * Cause a failure, but return false so it can be used in an assert statement.
+     * @param msg message to output.
+     * @return false, always
+     * @throws ValidationException always, because this delegates to fail(msg) first
+     */
+    fun assertFail(msg: String?): Boolean {
+        fail(msg)
+        return false
+    }
+}
diff --git a/ksoup/src/commonMain/kotlin/com/fleeksoft/ksoup/helper/ValidationException.kt b/ksoup/src/commonMain/kotlin/com/fleeksoft/ksoup/helper/ValidationException.kt
new file mode 100644
index 00000000..3d2a74cc
--- /dev/null
+++ b/ksoup/src/commonMain/kotlin/com/fleeksoft/ksoup/helper/ValidationException.kt
@@ -0,0 +1,26 @@
+package com.fleeksoft.ksoup.helper
+
+/**
+ * Validation exceptions, as thrown by the methods in [Validate].
+ */
+class ValidationException(msg: String?) : IllegalArgumentException(msg) {
+    // TODO: incomplete - the stack-trace filtering below is JVM-only code and is currently disabled
+    /*@Synchronized
+    override fun fillInStackTrace(): Throwable {
+        // Filters out the Validate class from the stacktrace, to more clearly point at the root-cause.
+        super.fillInStackTrace()
+        val stackTrace: Array = getStackTrace()
+        val filteredTrace: MutableList =
+            ArrayList()
+        for (trace in stackTrace) {
+            if (trace.getClassName() == Validator) continue
+            filteredTrace.add(trace)
+        }
+        setStackTrace(filteredTrace.toTypedArray())
+        return this
+    }*/
+
+    /*companion object {
+        val Validator: String = Validate::class.java.getName()
+    }*/
+}
diff --git a/ksoup/src/commonMain/kotlin/com/fleeksoft/ksoup/internal/ConstrainableSource.kt b/ksoup/src/commonMain/kotlin/com/fleeksoft/ksoup/internal/ConstrainableSource.kt
new file mode 100644
index 00000000..6bb3e6f4
--- /dev/null
+++ b/ksoup/src/commonMain/kotlin/com/fleeksoft/ksoup/internal/ConstrainableSource.kt
@@ -0,0 +1,109 @@
+package com.fleeksoft.ksoup.internal
+
+/**
+ * A com.fleeksoft.ksoup internal class (so don't use it as there is no contract API) that enables constraints on an Input Stream,
+ * namely a maximum read size, and the ability to Thread.interrupt() the read.
+ */ +import okio.Buffer +import com.fleeksoft.ksoup.ported.BufferReader +import com.fleeksoft.ksoup.ported.System +import kotlin.math.min + +class ConstrainableSource( + bufferReader: BufferReader, + maxSize: Int +) : BufferReader(bufferReader, maxSize) { + + companion object { + private const val DEFAULT_SIZE = 1024 * 32 + + fun wrap(bufferReader: BufferReader, maxSize: Int): ConstrainableSource { + return if (bufferReader is ConstrainableSource) { + bufferReader + } else { + ConstrainableSource(bufferReader, maxSize) + } + } + } + + private val capped: Boolean = maxSize != 0 + private var startTime = System.nanoTime() + private var timeout: Long = 0 // optional max time of request + private var remaining = maxSize + private var interrupted = false + + init { + require(maxSize >= 0) { "maxSize must be 0 (unlimited) or larger" } + } + + fun fullyRead(): Boolean = this.exhausted() + + override fun read(sink: ByteArray, offset: Int, byteCount: Int): Int { + if (interrupted || capped && remaining <= 0) { + return -1 + } + if (expired()) { + throw Exception("Read timeout") + } + + val toRead = if (capped && byteCount > remaining) remaining else byteCount + + return try { + val read = getActiveBuffer().read( + sink = sink, + offset = 0, + byteCount = min(toRead, getActiveBuffer().size.toInt()) + ) + if (!this.exhausted()) { + remaining -= read + } + read + } catch (e: Exception) { + 0 + } + } + + fun readToByteBuffer(max: Int): BufferReader { + require(max >= 0) { "maxSize must be 0 (unlimited) or larger" } + val localCapped = max > 0 + val bufferSize = if (localCapped && max < DEFAULT_SIZE) max else DEFAULT_SIZE + + var read: Int + var remaining = max + + val buffer = Buffer() + + while (true) { + val size: Int = min(bufferSize, this.getActiveBuffer().size.toInt()) + val readBuffer = ByteArray(size) + read = this.read(readBuffer, 0, size) + if (read > 0) { + buffer.write(readBuffer, 0, read) + } + if (this.exhausted()) break + if (localCapped) { + if (read >= 
remaining) { + break + } + remaining -= read + } + + } + return BufferReader(buffer) + } + + fun timeout(startTimeNanos: Long, timeoutMillis: Long): ConstrainableSource { + this.startTime = startTimeNanos + this.timeout = timeoutMillis * 1_000_000 + return this + } + + private fun expired(): Boolean { + if (timeout == 0L) { + return false + } + val now = System.nanoTime() + val dur = now - startTime + return dur > timeout + } +} diff --git a/ksoup/src/commonMain/kotlin/com/fleeksoft/ksoup/internal/Normalizer.kt b/ksoup/src/commonMain/kotlin/com/fleeksoft/ksoup/internal/Normalizer.kt new file mode 100644 index 00000000..c50ccc37 --- /dev/null +++ b/ksoup/src/commonMain/kotlin/com/fleeksoft/ksoup/internal/Normalizer.kt @@ -0,0 +1,21 @@ +package com.fleeksoft.ksoup.internal + +/** + * Util methods for normalizing strings. Jsoup internal use only, please don't depend on this API. + */ +object Normalizer { + /** Drops the input string to lower case. */ + fun lowerCase(input: String?): String { + return input?.lowercase() ?: "" + } + + /** Lower-cases and trims the input string. */ + fun normalize(input: String?): String { + return lowerCase(input).trim { it <= ' ' } + } + + /** If a string literal, just lower case the string; otherwise lower-case and trim. 
*/ + fun normalize(input: String?, isStringLiteral: Boolean): String { + return if (isStringLiteral) lowerCase(input) else normalize(input) + } +} diff --git a/ksoup/src/commonMain/kotlin/com/fleeksoft/ksoup/internal/StringUtil.kt b/ksoup/src/commonMain/kotlin/com/fleeksoft/ksoup/internal/StringUtil.kt new file mode 100644 index 00000000..90227536 --- /dev/null +++ b/ksoup/src/commonMain/kotlin/com/fleeksoft/ksoup/internal/StringUtil.kt @@ -0,0 +1,420 @@ +package com.fleeksoft.ksoup.internal + +import com.fleeksoft.ksoup.helper.Validate +import com.fleeksoft.ksoup.ported.Character +import com.fleeksoft.ksoup.ported.appendRelativePath +import com.fleeksoft.ksoup.ported.isAbsResource +import com.fleeksoft.ksoup.ported.isValidResourceUrl +import de.cketti.codepoints.deluxe.CodePoint +import de.cketti.codepoints.deluxe.appendCodePoint +import de.cketti.codepoints.deluxe.codePointAt +import io.ktor.http.* +import kotlin.math.min + +/** + * A minimal String utility class. Designed for **internal** com.fleeksoft.ksoup use only - the API and outcome may change without + * notice. 
+ */ +object StringUtil { + // memoised padding up to 21 (blocks 0 to 20 spaces) + val padding = arrayOf( + "", + " ", + " ", + " ", + " ", + " ", + " ", + " ", + " ", + " ", + " ", + " ", + " ", + " ", + " ", + " ", + " ", + " ", + " ", + " ", + " ", + ) + + /** + * Join a collection of strings by a separator + * @param strings collection of string objects + * @param sep string to place between strings + * @return joined string + */ + fun join(strings: Collection<*>, sep: String?): String { + return join(strings.iterator(), sep) + } + + /** + * Join a collection of strings by a separator + * @param strings iterator of string objects + * @param sep string to place between strings + * @return joined string + */ + fun join(strings: Iterator<*>, sep: String?): String { + if (!strings.hasNext()) return "" + val start = strings.next().toString() + if (!strings.hasNext()) { + // only one, avoid builder + return start + } + val j = StringJoiner(sep) + j.add(start) + while (strings.hasNext()) { + j.add(strings.next()) + } + return j.complete() + } + + + /** + * Returns space padding, up to a max of maxPaddingWidth. + * @param width amount of padding desired + * @param maxPaddingWidth maximum padding to apply. Set to `-1` for unlimited. + * @return string of spaces * width + */ + /** + * Returns space padding (up to the default max of 30). Use [.padding] to specify a different limit. 
+ * @param width amount of padding desired + * @return string of spaces * width + * @see .padding + */ + fun padding(width: Int, maxPaddingWidth: Int = 30): String { + var width = width + Validate.isTrue(width >= 0, "width must be >= 0") + Validate.isTrue(maxPaddingWidth >= -1) + if (maxPaddingWidth != -1) width = min(width, maxPaddingWidth) + if (width < padding.size) return padding[width] + val out = CharArray(width) + for (i in 0 until width) out[i] = ' ' + return out.concatToString() + } + + /** + * Tests if a string is blank: null, empty, or only whitespace (" ", \r\n, \t, etc) + * @param string string to test + * @return if string is blank + */ + fun isBlank(string: String?): Boolean { + if (string.isNullOrEmpty()) return true + val l = string.length + for (i in 0 until l) { + if (!isWhitespace(string.codePointAt(i).value)) return false + } + return true + } + + /** + * Tests if a string starts with a newline character + * @param string string to test + * @return if its first character is a newline + */ + fun startsWithNewline(string: String?): Boolean { + return if (string.isNullOrEmpty()) false else string[0] == '\n' + } + + /** + * Tests if a string is numeric, i.e. contains only digit characters + * @param string string to test + * @return true if only digit chars, false if empty or null or contains non-digit chars + */ + fun isNumeric(string: String?): Boolean { + if (string.isNullOrEmpty()) return false + val l = string.length + for (i in 0 until l) { + if (!Character.isDigit(string.codePointAt(i))) return false + } + return true + } + + /** + * Tests if a code point is "whitespace" as defined in the HTML spec. Used for output HTML. 
+ * @param c code point to test + * @return true if code point is whitespace, false otherwise + * @see .isActuallyWhitespace + */ + fun isWhitespace(c: Int): Boolean { + return c == ' '.code || c == '\t'.code || c == '\n'.code || c == '\u000c'.code || c == '\r'.code + } + + /** + * Tests if a code point is "whitespace" as defined by what it looks like. Used for Element.text etc. + * @param c code point to test + * @return true if code point is whitespace, false otherwise + */ + fun isActuallyWhitespace(c: Int): Boolean { + return c == ' '.code || c == '\t'.code || c == '\n'.code || c == '\u000c'.code || c == '\r'.code || c == 160 + // 160 is   (non-breaking space). Not in the spec but expected. + } + + fun isInvisibleChar(c: Int): Boolean { + return c == 8203 || c == 173 // zero width sp, soft hyphen + // previously also included zw non join, zw join - but removing those breaks semantic meaning of text + } + + /** + * Normalise the whitespace within this string; multiple spaces collapse to a single, and all whitespace characters + * (e.g. newline, tab) convert to a simple space. + * @param string content to normalise + * @return normalised string + */ + fun normaliseWhitespace(string: String): String { + val sb: StringBuilder = borrowBuilder() + appendNormalisedWhitespace(sb, string, false) + return releaseBuilder(sb) + } + + /** + * After normalizing the whitespace within a string, appends it to a string builder. 
+ * @param accum builder to append to + * @param string string to normalize whitespace within + * @param stripLeading set to true if you wish to remove any leading whitespace + */ + fun appendNormalisedWhitespace( + accum: StringBuilder, + string: String, + stripLeading: Boolean, + ) { + var lastWasWhite = false + var reachedNonWhite = false + val len = string.length + var c: CodePoint + var i = 0 + while (i < len) { + c = string.codePointAt(i) + if (isActuallyWhitespace(c.value)) { + if (stripLeading && !reachedNonWhite || lastWasWhite) { + i += c.charCount + continue + } + accum.append(' ') + lastWasWhite = true + } else if (!isInvisibleChar(c.value)) { + accum.appendCodePoint(c) + lastWasWhite = false + reachedNonWhite = true + } + i += c.charCount + } + } + + fun isIn(needle: String, vararg haystack: String): Boolean { + val len = haystack.size + for (i in 0 until len) { + if (haystack[i] == needle) return true + } + return false + } + + fun inSorted(needle: String, haystack: Array): Boolean { + return haystack.toList().binarySearch(needle) >= 0 + } + + /** + * Tests that a String contains only ASCII characters. + * @param string scanned string + * @return true if all characters are in range 0 - 127 + */ + fun isAscii(string: String): Boolean { + for (element in string) { + val c = element.code + if (c > 127) { // ascii range + return false + } + } + return true + } + + private val extraDotSegmentsPattern: Regex = Regex("^/((\\.{1,2}/)+)") +// private val extraDotSegmentsPatternJava: Pattern = Pattern.compile("^/((\\.{1,2}/)+)"); + + /** + * Create a new absolute URL, from a provided existing absolute URL and a relative URL component. + * @param base the existing absolute base URL + * @param relUrl the relative URL to resolve. 
(If it's already absolute, it will be returned) + * @return the resolved absolute URL + * @throws MalformedURLException if an error occurred generating the URL + */ + + // TODO: resolve alt + /*@Throws(java.net.MalformedURLException::class) + fun resolve(base: java.net.URL, relUrl: String): java.net.URL { + var relUrl = relUrl + relUrl = stripControlChars(relUrl) + // workaround: java resolves '//path/file + ?foo' to '//path/?foo', not '//path/file?foo' as desired + if (relUrl.startsWith("?")) relUrl = base.getPath() + relUrl + // workaround: //example.com + ./foo = //example.com/./foo, not //example.com/foo + val url: java.net.URL = java.net.URL(base, relUrl) + var fixedFile: String = extraDotSegmentsPattern.matcher(url.getFile()).replaceFirst("/") + if (url.getRef() != null) { + fixedFile = fixedFile + "#" + url.getRef() + } + return java.net.URL(url.getProtocol(), url.getHost(), url.getPort(), fixedFile) + }*/ + + fun resolve(base: Url, relUrl: String): Url { + var relUrl = relUrl + relUrl = stripControlChars(relUrl) + // workaround: java resolves '//path/file + ?foo' to '//path/?foo', not '//path/file?foo' as desired +// if (relUrl.startsWith("?")) relUrl = base.fullPath + relUrl + // workaround: //example.com + ./foo = //example.com/./foo, not //example.com/foo + val url = URLBuilder(base).build() + + if (relUrl.isEmpty()) { + return url + } + + if (relUrl.isValidResourceUrl()) { + return URLBuilder(relUrl) + .apply { + if (relUrl.startsWith("//")) { + protocol = url.protocol + } + } + .build() + } + + return URLBuilder( + protocol = url.protocol, + host = url.host, + port = url.port, + pathSegments = url.pathSegments + ) + .appendRelativePath(relUrl) + .build() + var fixedFile: String = extraDotSegmentsPattern.replace(url.encodedPathAndQuery, "/") + /*if (url.ref != null) { + fixedFile = fixedFile + "#" + url.ref + }*/ + return URLBuilder( + protocol = url.protocol, + host = url.host, + port = url.port, + pathSegments = listOf(fixedFile) + ).build() + } + + 
/** + * Create a new absolute URL, from a provided existing absolute URL and a relative URL component. + * @param baseUrl the existing absolute base URL + * @param relUrl the relative URL to resolve. (If it's already absolute, it will be returned) + * @return an absolute URL if one was able to be generated, or the empty string if not + */ + fun resolve(baseUrl: String?, relUrl: String?): String { + // workaround: java will allow control chars in a path URL and may treat as relative, but Chrome / Firefox will strip and may see as a scheme. Normalize to browser's view. + var baseUrl = baseUrl + var relUrl = relUrl + baseUrl = stripControlChars(baseUrl!!) + relUrl = stripControlChars(relUrl!!) + +// mailto, tel, geo, about etc.. + if (relUrl.isAbsResource()) { + return relUrl + } + return if (baseUrl.isValidResourceUrl()) { + resolve(Url(baseUrl), relUrl).toString() + } else if (relUrl.isValidResourceUrl()) { + Url(relUrl).toString() + } else { + if (validUriScheme.matches(relUrl)) relUrl else "" + } + } + + private val validUriScheme: Regex = "^[a-zA-Z][a-zA-Z0-9+-.]*:".toRegex() + private val controlChars: Regex = + Regex("[\\x00-\\x1f]*") // matches ascii 0 - 31, to strip from url + + private fun stripControlChars(input: String): String { + return input.replace(controlChars, "") + } + + private val stringLocalBuilders: ArrayDeque = ArrayDeque() + + /** + * Maintains cached StringBuilders in a flyweight pattern, to minimize new StringBuilder GCs. The StringBuilder is + * prevented from growing too large. + * + * + * Care must be taken to release the builder once its work has been completed, with [.releaseBuilder] + * @return an empty StringBuilder + */ + fun borrowBuilder(): StringBuilder { + return StringBuilder(MaxCachedBuilderSize) + } + + /** + * Release a borrowed builder. Care must be taken not to use the builder after it has been returned, as its + * contents may be changed by this method, or by a concurrent thread. + * @param sb the StringBuilder to release. 
+ * @return the string value of the released String Builder (as an incentive to release it!). + */ + fun releaseBuilder(sb: StringBuilder): String { + var sb: StringBuilder = sb + val string: String = sb.toString() + if (sb.length > MaxCachedBuilderSize) { + sb = StringBuilder(MaxCachedBuilderSize) // make sure it hasn't grown too big + } else { + sb.clear() // make sure it's emptied on release + } + stringLocalBuilders.addLast(sb) + while (stringLocalBuilders.size > MaxIdleBuilders) { + stringLocalBuilders.removeLast() + } + return string + } + + private const val MaxCachedBuilderSize = 8 * 1024 + private const val MaxIdleBuilders = 8 + + /** + * A StringJoiner allows incremental / filtered joining of a set of stringable objects. + * @since 1.14.1 + */ + class StringJoiner + /** + * Create a new joiner, that uses the specified separator. MUST call [.complete] or will leak a thread + * local string builder. + * + * @param separator the token to insert between strings + */(val separator: String?) { + /*@Nullable*/ + // sets null on builder release so can't accidentally be reused + var sb: StringBuilder? = borrowBuilder() + var first = true + + /** + * Add another item to the joiner, will be separated + */ + fun add(stringy: Any?): StringJoiner { + Validate.notNull(sb) // don't reuse + if (!first) sb?.append(separator) + sb?.append(stringy) + first = false + return this + } + + /** + * Append content to the current item; not separated + */ + fun append(stringy: Any?): StringJoiner { + Validate.notNull(sb) // don't reuse + sb?.append(stringy) + return this + } + + /** + * Return the joined string, and release the builder back to the pool. This joiner cannot be reused. 
+ */ + fun complete(): String { + val string = sb?.let { releaseBuilder(it) } + sb = null + return string ?: "" + } + } +} diff --git a/ksoup/src/commonMain/kotlin/com/fleeksoft/ksoup/nodes/Attribute.kt b/ksoup/src/commonMain/kotlin/com/fleeksoft/ksoup/nodes/Attribute.kt new file mode 100644 index 00000000..4b121ad0 --- /dev/null +++ b/ksoup/src/commonMain/kotlin/com/fleeksoft/ksoup/nodes/Attribute.kt @@ -0,0 +1,332 @@ +package com.fleeksoft.ksoup.nodes + +import okio.IOException +import com.fleeksoft.ksoup.SerializationException +import com.fleeksoft.ksoup.helper.Validate +import com.fleeksoft.ksoup.internal.StringUtil +import com.fleeksoft.ksoup.nodes.Document.OutputSettings.Syntax +import com.fleeksoft.ksoup.ported.Cloneable + +/** + * A single key + value attribute. (Only used for presentation.) + */ +open class Attribute : Map.Entry, Cloneable { + private var attributeKey: String + + /*@Nullable*/ + private var attributeValue: String? + + /*@Nullable*/ + var parent: Attributes? + + /** + * Create a new attribute from unencoded (raw) key and value. + * @param key attribute key; case is preserved. + * @param value attribute value (may be null) + * @see .createFromEncoded + */ + constructor(key: String, /*@Nullable*/ value: String?) : this(key, value, null) + + /** + * Get the attribute key. + * @return the attribute key + */ + override val key: String + get() = attributeKey + + /** + * Set the attribute key; case is preserved. + * @param key the new key; must not be null + */ + fun setKey(key: String) { + var key = key + key = key.trim { it <= ' ' } + Validate.notEmpty(key) // trimming could potentially make empty, so validate here + if (parent != null) { + val i: Int = parent!!.indexOfKey(this.attributeKey) + if (i != Attributes.NotFound) parent!!.keys[i] = key + } + this.attributeKey = key + } + + override val value: String + /** + * Get the attribute value. Will return an empty string if the value is not set. 
+ * @return the attribute value + */ + get() = Attributes.checkNotNull(attributeValue) + + /** + * Check if this Attribute has a value. Set boolean attributes have no value. + * @return if this is a boolean attribute / attribute without a value + */ + fun hasDeclaredValue(): Boolean { + return attributeValue != null + } + + /** + * Set the attribute value. + * @param newValue the new attribute value; may be null (to set an enabled boolean attribute) + * @return the previous value (if was null; an empty string) + */ + fun setValue(/*@Nullable*/ newValue: String?): String? { + var oldVal = this.attributeValue + if (parent != null) { + val i: Int = parent!!.indexOfKey(attributeKey) + if (i != Attributes.NotFound) { + oldVal = parent!![attributeKey] // trust the container more + parent!!.vals[i] = newValue + } + } + this.attributeValue = newValue + return Attributes.checkNotNull(oldVal) + } + + /** + * Get the HTML representation of this attribute; e.g. `href="index.html"`. + * @return HTML + */ + fun html(): String { + val sb: StringBuilder = StringUtil.borrowBuilder() + try { + html(sb, Document("").outputSettings()) + } catch (exception: IOException) { + throw SerializationException(exception) + } + return StringUtil.releaseBuilder(sb) + } + + @Throws(IOException::class) + protected fun html(accum: Appendable, out: Document.OutputSettings) { + html(attributeKey, attributeValue, accum, out) + } + + /** + * Create a new attribute from unencoded (raw) key and value. + * @param key attribute key; case is preserved. 
+ * @param val attribute value (may be null) + * @param parent the containing Attributes (this Attribute is not automatically added to said Attributes) + * @see .createFromEncoded + */ + constructor( + key: String, /*@Nullable*/ + value: String?, /*@Nullable*/ + parent: Attributes?, + ) { + var key = key + key = key.trim { it <= ' ' } + Validate.notEmpty(key) // trimming could potentially make empty, so validate here + this.attributeKey = key + this.attributeValue = value + this.parent = parent + } + + /** + * Get the string representation of this attribute, implemented as [.html]. + * @return string + */ + override fun toString(): String { + return html() + } + + fun isDataAttribute(): Boolean = isDataAttribute(attributeKey) + + /** + * Collapsible if it's a boolean attribute and value is empty or same as name + * + * @param out output settings + * @return Returns whether collapsible or not + */ + protected fun shouldCollapseAttribute(out: Document.OutputSettings): Boolean { + return shouldCollapseAttribute(attributeKey, attributeValue, out) + } + + override fun equals(/*@Nullable*/ o: Any?): Boolean { // note parent not considered + if (this === o) return true + if (o == null || this::class != o::class) return false + val attribute: Attribute = o as Attribute + if (attributeKey != attribute.attributeKey) return false + return if (attributeValue != null) attributeValue == attribute.attributeValue else attribute.attributeValue == null + } + + override fun hashCode(): Int { // note parent not considered + var result = attributeKey.hashCode() + result = 31 * result + if (attributeValue != null) attributeValue.hashCode() else 0 + return result + } + + override fun clone(): Attribute { + val attribute = Attribute(attributeKey, attributeValue) + attribute.parent = this.parent + return attribute + } + + companion object { + private val booleanAttributes = arrayOf( + "allowfullscreen", + "async", + "autofocus", + "checked", + "compact", + "declare", + "default", + 
"defer", + "disabled", + "formnovalidate", + "hidden", + "inert", + "ismap", + "itemscope", + "multiple", + "muted", + "nohref", + "noresize", + "noshade", + "novalidate", + "nowrap", + "open", + "readonly", + "required", + "reversed", + "seamless", + "selected", + "sortable", + "truespeed", + "typemustmatch", + ) + + @Throws(IOException::class) + protected fun html( + key: String, + /*@Nullable*/ + value: String?, + accum: Appendable, + out: Document.OutputSettings, + ) { + val resultKey: String = getValidKey(key, out.syntax()) ?: return // can't write it :( + htmlNoValidate(resultKey, value, accum, out) + } + + @Throws(IOException::class) + fun htmlNoValidate( + key: String, + /*@Nullable*/ + value: String?, + accum: Appendable, + out: Document.OutputSettings, + ) { + // structured like this so that Attributes can check we can write first, so it can add whitespace correctly + accum.append(key) + if (!shouldCollapseAttribute(key, value, out)) { + accum.append("=\"") + Entities.escape( + accum, + Attributes.checkNotNull(value), + out, + inAttribute = true, + normaliseWhite = false, + stripLeadingWhite = false, + trimTrailing = false, + ) + accum.append('"') + } + } + + private val xmlKeyValid: Regex = + Regex("[a-zA-Z_:][-a-zA-Z0-9_:.]*") + private val xmlKeyReplace: Regex = + Regex("[^-a-zA-Z0-9_:.]") + private val htmlKeyValid: Regex = + Regex("[^\\x00-\\x1f\\x7f-\\x9f \"'/=]+") + private val htmlKeyReplace: Regex = + Regex("[\\x00-\\x1f\\x7f-\\x9f \"'/=]") + + fun getValidKey(key: String, syntax: Syntax): String? 
{ + return when (syntax) { + Syntax.xml -> { + if (!xmlKeyValid.matches(key)) { + val newKey = xmlKeyReplace.replace(key, "") + if (xmlKeyValid.matches(newKey)) newKey else null + } else { + key + } + } + + Syntax.html -> { + if (!htmlKeyValid.matches(key)) { + val newKey = htmlKeyReplace.replace(key, "") + if (htmlKeyValid.matches(newKey)) newKey else null + } else { + key + } + } + } + } + + /*@Nullable*/ + /*fun getValidKey(key: String?, syntax: Syntax): String? { + // we consider HTML attributes to always be valid. XML checks key validity + var key = key + if (syntax === Syntax.xml && !xmlKeyValid.matcher(key).matches()) { + key = xmlKeyReplace.matcher(key).replaceAll("") + return if (xmlKeyValid.matcher(key) + .matches() + ) { + key + } else { + null // null if could not be coerced + } + } else if (syntax === Syntax.html && !htmlKeyValid.matcher(key).matches()) { + key = htmlKeyReplace.matcher(key).replaceAll("") + return if (htmlKeyValid.matcher(key) + .matches() + ) { + key + } else { + null // null if could not be coerced + } + } + return key + }*/ + + /** + * Create a new Attribute from an unencoded key and a HTML attribute encoded value. + * @param unencodedKey assumes the key is not encoded, as can be only run of simple \w chars. 
+ * @param encodedValue HTML attribute encoded value + * @return attribute + */ + fun createFromEncoded(unencodedKey: String, encodedValue: String): Attribute { + val value: String = Entities.unescape(encodedValue, true) + return Attribute(unencodedKey, value, null) // parent will get set when Put + } + + protected fun isDataAttribute(key: String): Boolean { + return key.startsWith(Attributes.dataPrefix) && key.length > Attributes.dataPrefix.length + } + + // collapse unknown foo=null, known checked=null, checked="", checked=checked; write out others + protected fun shouldCollapseAttribute( + key: String, + /*@Nullable*/ + value: String?, + out: Document.OutputSettings, + ): Boolean { + return out.syntax() === Syntax.html && + ( + value == null || ( + value.isEmpty() || value.equals( + key, + ignoreCase = true, + ) + ) && isBooleanAttribute(key) + ) + } + + /** + * Checks if this attribute name is defined as a boolean attribute in HTML5 + */ + fun isBooleanAttribute(key: String): Boolean { + return booleanAttributes.toList().binarySearch { it.compareTo(key.lowercase()) } >= 0 + } + } +} diff --git a/ksoup/src/commonMain/kotlin/com/fleeksoft/ksoup/nodes/Attributes.kt b/ksoup/src/commonMain/kotlin/com/fleeksoft/ksoup/nodes/Attributes.kt new file mode 100644 index 00000000..489191c4 --- /dev/null +++ b/ksoup/src/commonMain/kotlin/com/fleeksoft/ksoup/nodes/Attributes.kt @@ -0,0 +1,592 @@ +package com.fleeksoft.ksoup.nodes + +import okio.IOException +import com.fleeksoft.ksoup.SerializationException +import com.fleeksoft.ksoup.helper.Validate +import com.fleeksoft.ksoup.internal.Normalizer.lowerCase +import com.fleeksoft.ksoup.internal.StringUtil +import com.fleeksoft.ksoup.parser.ParseSettings +import com.fleeksoft.ksoup.ported.Cloneable +import com.fleeksoft.ksoup.ported.Collections + +/** + * The attributes of an Element. + * + * + * Attributes are treated as a map: there can be only one value associated with an attribute key/name. 
+ * + * + * + * Attribute name and value comparisons are generally **case sensitive**. By default for HTML, attribute names are + * normalized to lower-case on parsing. That means you should use lower-case strings when referring to attributes by + * name. + * + * + * @author Sabeeh, fleeksoft@gmail.com + */ +class Attributes : Iterable, Cloneable { + // the number of instance fields is kept as low as possible giving an object size of 24 bytes + private var size = 0 // number of slots used (not total capacity, which is keys.length) + var keys: Array = arrayOfNulls(InitialCapacity) + var vals = + arrayOfNulls(InitialCapacity) // Genericish: all non-internal attribute values must be Strings and are cast on access. + + // check there's room for more + private fun checkCapacity(minNewSize: Int) { + Validate.isTrue(minNewSize >= size) + val curCap = keys.size + if (curCap >= minNewSize) return + var newCap = if (curCap >= InitialCapacity) size * GrowthFactor else InitialCapacity + if (minNewSize > newCap) newCap = minNewSize + keys = keys.copyOf(newCap) + vals = vals.copyOf(newCap) + } + + fun indexOfKey(key: String): Int { + for (i in 0 until size) { + if (key == keys[i]) return i + } + return NotFound + } + + private fun indexOfKeyIgnoreCase(key: String): Int { + Validate.notNull(key) + for (i in 0 until size) { + if (key.equals(keys[i], ignoreCase = true)) return i + } + return NotFound + } + + /** + * Get an attribute value by key. + * @param key the (case-sensitive) attribute key + * @return the attribute value if set; or empty string if not set (or a boolean attribute). + * @see .hasKey + */ + operator fun get(key: String): String { + val i = indexOfKey(key) + return if (i == NotFound) { + EmptyString + } else { + checkNotNull( + vals[i], + ) + } + } + + /** + * Get an attribute's value by case-insensitive key + * @param key the attribute name + * @return the first matching attribute value if set; or empty string if not set (ora boolean attribute). 
+ */ + fun getIgnoreCase(key: String): String { + val i = indexOfKeyIgnoreCase(key) + return if (i == NotFound) { + EmptyString + } else { + checkNotNull( + vals[i], + ) + } + } + + /** + * Get an arbitrary user data object by key. + * @param key case sensitive key to the object. + * @return the object associated to this key, or `null` if not found. + */ +// @Nullable + fun getUserData(key: String): Any? { + var key = key + Validate.notNull(key) + if (!isInternalKey(key)) key = internalKey(key) + val i = indexOfKeyIgnoreCase(key) + return if (i == NotFound) null else vals[i] + } + + /** + * Adds a new attribute. Will produce duplicates if the key already exists. + * @see Attributes.put + */ + fun add(key: String, /*@Nullable*/ value: String?): Attributes { + addObject(key, value) + return this + } + + private fun addObject(key: String, /*@Nullable*/ value: Any?) { + checkCapacity(size + 1) + keys[size] = key + vals[size] = value + size++ + } + + /** + * Set a new attribute, or replace an existing one by key. + * @param key case sensitive attribute key (not null) + * @param value attribute value (may be null, to set a boolean attribute) + * @return these attributes, for chaining + */ + fun put(key: String, /*@Nullable*/ value: String?): Attributes { + Validate.notNull(key) + val i = indexOfKey(key) + if (i != NotFound) vals[i] = value else add(key, value) + return this + } + + /** + * Put an arbitrary user-data object by key. Will be treated as an internal attribute, so will not be emitted in HTML. + * @param key case sensitive key + * @param value object value + * @return these attributes + * @see .getUserData + */ + fun putUserData(key: String, value: Any?): Attributes { + var key = key + Validate.notNull(key) + if (!isInternalKey(key)) key = internalKey(key) + Validate.notNull(value) + val i = indexOfKey(key) + if (i != NotFound) vals[i] = value else addObject(key, value) + return this + } + + fun putIgnoreCase(key: String, /*@Nullable*/ value: String?) 
{ + val i = indexOfKeyIgnoreCase(key) + if (i != NotFound) { + vals[i] = value + if (keys[i] != key) { + // case changed, update + keys[i] = key + } + } else { + add(key, value) + } + } + + /** + * Set a new boolean attribute, remove attribute if value is false. + * @param key case **insensitive** attribute key + * @param value attribute value + * @return these attributes, for chaining + */ + fun put(key: String, value: Boolean): Attributes { + if (value) putIgnoreCase(key, null) else remove(key) + return this + } + + /** + * Set a new attribute, or replace an existing one by key. + * @param attribute attribute with case sensitive key + * @return these attributes, for chaining + */ + fun put(attribute: Attribute): Attributes { + Validate.notNull(attribute) + put(attribute.key, attribute.value) + attribute.parent = this + return this + } + + // removes and shifts up + private fun remove(index: Int) { + Validate.isFalse(index >= size) + val shifted = size - index - 1 + if (shifted > 0) { + keys.copyInto( + destination = keys, + destinationOffset = index, + startIndex = index + 1, + endIndex = index + 1 + shifted, + ) + vals.copyInto( + destination = vals, + destinationOffset = index, + startIndex = index + 1, + endIndex = index + 1 + shifted, + ) + } + size-- + keys[size] = null // release hold + vals[size] = null + } + + /** + * Remove an attribute by key. **Case sensitive.** + * @param key attribute key to remove + */ + fun remove(key: String) { + val i = indexOfKey(key) + if (i != NotFound) remove(i) + } + + /** + * Remove an attribute by key. **Case insensitive.** + * @param key attribute key to remove + */ + fun removeIgnoreCase(key: String) { + val i = indexOfKeyIgnoreCase(key) + if (i != NotFound) remove(i) + } + + /** + * Tests if these attributes contain an attribute with this key. 
+ * @param key case-sensitive key to check for + * @return true if key exists, false otherwise + */ + fun hasKey(key: String): Boolean { + return indexOfKey(key) != NotFound + } + + /** + * Tests if these attributes contain an attribute with this key. + * @param key key to check for + * @return true if key exists, false otherwise + */ + fun hasKeyIgnoreCase(key: String): Boolean { + return indexOfKeyIgnoreCase(key) != NotFound + } + + /** + * Check if these attributes contain an attribute with a value for this key. + * @param key key to check for + * @return true if key exists, and it has a value + */ + fun hasDeclaredValueForKey(key: String): Boolean { + val i = indexOfKey(key) + return i != NotFound && vals[i] != null + } + + /** + * Check if these attributes contain an attribute with a value for this key. + * @param key case-insensitive key to check for + * @return true if key exists, and it has a value + */ + fun hasDeclaredValueForKeyIgnoreCase(key: String): Boolean { + val i = indexOfKeyIgnoreCase(key) + return i != NotFound && vals[i] != null + } + + /** + * Get the number of attributes in this set, including any com.fleeksoft.ksoup internal-only attributes. Internal attributes are + * excluded from the [.html], [.asList], and [.iterator] methods. + * @return size + */ + fun size(): Int { + return size + } + + fun isEmpty(): Boolean = size == 0 + + /** + * Add all the attributes from the incoming set to this set. + * @param incoming attributes to add to these attributes. 
+ */ + fun addAll(incoming: Attributes) { + if (incoming.size() == 0) return + checkCapacity(size + incoming.size) + val needsPut = + size != 0 // if this set is empty, no need to check existing set, so can add() vs put() + // (and save bashing on the indexOfKey() + for (attr in incoming) { + if (needsPut) put(attr) else add(attr.key, attr.value) + } + } + + override fun iterator(): MutableIterator { + return object : MutableIterator { + var expectedSize = size + var i = 0 + override fun hasNext(): Boolean { + checkModified() + while (i < size) { + if (isInternalKey(keys[i])) { + // skip over internal keys + i++ + } else { + break + } + } + return i < size + } + + override fun next(): Attribute { + checkModified() + val attr = Attribute(keys[i]!!, vals[i] as String?, this@Attributes) + i++ + return attr + } + + private fun checkModified() { + if (size != expectedSize) throw ConcurrentModificationException("Use Iterator#remove() instead to remove attributes while iterating.") + } + + override fun remove() { + this@Attributes.remove(--i) // next() advanced, so rewind + expectedSize-- + } + } + } + + /** + * Get the attributes as a List, for iteration. + * @return a view of the attributes as an unmodifiable List. + */ + fun asList(): List { + val list: ArrayList = ArrayList(size) + for (i in 0 until size) { + if (isInternalKey(keys[i])) continue // skip internal keys + val attr = Attribute(keys[i]!!, vals[i] as String?, this@Attributes) + list.add(attr) + } + return Collections.unmodifiableList(list) + } + + /** + * Retrieves a filtered view of attributes that are HTML5 custom data attributes; that is, attributes with keys + * starting with `data-`. + * @return map of custom data attributes. + */ + fun dataset(): Dataset { + return Dataset(this) + } + + /** + * Get the HTML representation of these attributes. 
+ * @return HTML + */ + fun html(): String { + val sb: StringBuilder = StringUtil.borrowBuilder() + try { + html( + sb, + Document("").outputSettings(), + ) // output settings a bit funky, but this html() seldom used + } catch (e: IOException) { // ought never happen + throw SerializationException(e) + } + return StringUtil.releaseBuilder(sb) + } + + @Throws(IOException::class) + fun html(accum: Appendable, out: Document.OutputSettings) { + val sz = size + for (i in 0 until sz) { + if (isInternalKey(keys[i])) continue + val key: String? = keys[i]?.let { Attribute.getValidKey(it, out.syntax()) } + if (key != null) { + Attribute.htmlNoValidate( + key, + vals[i] as String?, + accum.append(' '), + out, + ) + } + } + } + + override fun toString(): String { + return html() + } + + /** + * Checks if these attributes are equal to another set of attributes, by comparing the two sets. Note that the order + * of the attributes does not impact this equality (as per the Map interface equals()). + * @param o attributes to compare with + * @return if both sets of attributes have the same content + */ + override fun equals(/*@Nullable*/ o: Any?): Boolean { + if (this === o) return true + if (o == null || this::class != o::class) return false + val that = o as Attributes + if (size != that.size) return false + for (i in 0 until size) { + val key = keys[i]!! + val thatI = that.indexOfKey(key) + if (thatI == NotFound) return false + val value = vals[i] + val thatVal = that.vals[thatI] + if (value == null) { + if (thatVal != null) return false + } else if (value != thatVal) return false + } + return true + } + + /** + * Calculates the hashcode of these attributes, by iterating all attributes and summing their hashcodes. 
+ * @return calculated hashcode + */ + override fun hashCode(): Int { + var result = size + result = 31 * result + keys.hashCode() + result = 31 * result + vals.hashCode() + return result + } + + override fun clone(): Attributes { + val attributes = Attributes() + attributes.addAll(this) + + attributes.size = size + + attributes.keys = keys.copyOf(size) + attributes.vals = vals.copyOf(size) + return attributes + } + + /** + * Internal method. Lowercases all keys. + */ + fun normalize() { + for (i in 0 until size) { + keys[i] = lowerCase(keys[i]) + } + } + + /** + * Internal method. Removes duplicate attribute by name. Settings for case sensitivity of key names. + * @param settings case sensitivity + * @return number of removed dupes + */ + fun deduplicate(settings: ParseSettings): Int { + if (isEmpty()) return 0 + val preserve: Boolean = settings.preserveAttributeCase() + var dupes = 0 + OUTER@ for (i in keys.indices) { + var j = i + 1 + while (j < keys.size) { + if (keys[j] == null) continue@OUTER // keys.length doesn't shrink when removing, so re-test + if (preserve && keys[i] == keys[j] || !preserve && keys[i].equals( + keys[j], + ignoreCase = true, + ) + ) { + dupes++ + remove(j) + j-- + } + j++ + } + } + return dupes + } + + class Dataset(private val attributes: Attributes) { + val size: Int + get() = attributes.count { it.isDataAttribute() } + + operator fun set(key: String, value: String): String? { + val dataKey = dataKey(key) + val oldValue = if (attributes.hasKey(dataKey)) attributes[dataKey] else null + attributes.put(dataKey, value) + return oldValue + } + + operator fun get(key: String): String? 
{ + val dataKey = "$dataPrefix$key" + return if (dataKey.length > dataPrefix.length && attributes.hasKey(dataKey)) { + attributes[dataKey] + } else { + null + } + } + + fun remove(key: String) { + attributes.remove("$dataPrefix$key") + } + } + + class Dataset3(private val attributes: Attributes) : AbstractMap() { + + override val entries: Set> + get() = EntrySet() + + operator fun set(key: String, value: String): String? { + val dataKey = dataKey(key) + val oldValue = if (attributes.hasKey(dataKey)) attributes[dataKey] else null + attributes.put(dataKey, value) + return oldValue + } + + fun remove(key: String) { + attributes.remove(key) + } + + private inner class EntrySet : AbstractSet>() { + override val size: Int + get() { + var count = 0 + val iter = DatasetIterator() + while (iter.hasNext()) { + count++ + iter.next() // Ensure the iterator advances + } + return count + } + + override fun iterator(): Iterator> { + return DatasetIterator() + } + } + + private inner class DatasetIterator : Iterator> { + private val attrIter: Iterator = attributes.iterator() + private var attr: Attribute? = null + + override fun hasNext(): Boolean { + while (attrIter.hasNext()) { + val nextAttr = attrIter.next() + if (nextAttr.isDataAttribute()) { + attr = nextAttr + return true + } + } + return false + } + + override fun next(): Map.Entry { + val currentAttr = attr ?: throw NoSuchElementException() + return Attribute(currentAttr.key.substring(dataPrefix.length), currentAttr.value) + } + + fun remove() { + val currentAttr = attr ?: throw IllegalStateException() + attributes.remove(currentAttr.key) + } + } + } + + private fun isInternalKey(key: String?): Boolean { + return key != null && key.length > 1 && key[0] == InternalPrefix + } + + companion object { + // The Attributes object is only created on the first use of an attribute; the Element will just have a null + // Attribute slot otherwise + const val dataPrefix = "data-" + + // Indicates a com.fleeksoft.ksoup internal key. 
Can't be set via HTML. (It could be set via accessor, but not too worried about + // that. Suppressed from list, iter. + const val InternalPrefix = '/' + private const val InitialCapacity = + 3 // sampling found mean count when attrs present = 1.49; 1.08 overall. 2.6:1 don't have any attrs. + + // manages the key/val arrays + private const val GrowthFactor = 2 + const val NotFound = -1 + private const val EmptyString = "" + + // we track boolean attributes as null in values - they're just keys. so returns empty for consumers + // casts to String, so only for non-internal attributes + fun checkNotNull(/*@Nullable*/ value: Any?): String { + return if (value == null) EmptyString else (value as String?)!! + } + + private fun dataKey(key: String): String { + return dataPrefix + key + } + + fun internalKey(key: String): String { + return InternalPrefix.toString() + key + } + } +} diff --git a/ksoup/src/commonMain/kotlin/com/fleeksoft/ksoup/nodes/CDataNode.kt b/ksoup/src/commonMain/kotlin/com/fleeksoft/ksoup/nodes/CDataNode.kt new file mode 100644 index 00000000..2bc4f68e --- /dev/null +++ b/ksoup/src/commonMain/kotlin/com/fleeksoft/ksoup/nodes/CDataNode.kt @@ -0,0 +1,44 @@ +package com.fleeksoft.ksoup.nodes + +import okio.IOException + +/** + * A Character Data node, to support CDATA sections. + */ +class CDataNode(text: String?) : TextNode(text!!) { + override fun nodeName(): String { + return "#cdata" + } + + /** + * Get the unencoded, **non-normalized** text content of this CDataNode. 
+ * @return unencoded, non-normalized text + */ + override fun text(): String { + return getWholeText() + } + + @Throws(IOException::class) + override fun outerHtmlHead( + accum: Appendable, + depth: Int, + out: Document.OutputSettings, + ) { + accum + .append("") + } + + override fun clone(): CDataNode { + return this.clone() as CDataNode + } +} diff --git a/ksoup/src/commonMain/kotlin/com/fleeksoft/ksoup/nodes/Comment.kt b/ksoup/src/commonMain/kotlin/com/fleeksoft/ksoup/nodes/Comment.kt new file mode 100644 index 00000000..ccfa8e2b --- /dev/null +++ b/ksoup/src/commonMain/kotlin/com/fleeksoft/ksoup/nodes/Comment.kt @@ -0,0 +1,100 @@ +package com.fleeksoft.ksoup.nodes + +import com.fleeksoft.ksoup.parser.ParseSettings +import com.fleeksoft.ksoup.parser.Parser + +/** + * A comment node. + * + * @author Sabeeh, fleeksoft@gmail.com + */ +class Comment(data: String) : LeafNode() { + /** + * Create a new comment node. + * @param data The contents of the comment + */ + init { + value = data + } + + override fun nodeName(): String { + return "#comment" + } + + fun getData(): String = coreValue() + + fun setData(data: String?): Comment { + coreValue(data) + return this + } + + override fun outerHtmlHead(accum: Appendable, depth: Int, out: Document.OutputSettings) { + if (out.prettyPrint() && ( + isEffectivelyFirst() && _parentNode is Element && (_parentNode as Element).tag() + .formatAsBlock() || out.outline() + ) + ) { + indent(accum, depth, out) + } + accum + .append("") + } + + override fun outerHtmlTail( + accum: Appendable, + depth: Int, + out: Document.OutputSettings, + ) { + } + + override fun toString(): String { + return outerHtml() + } + + override fun createClone(): Node { + return Comment(value as String) + } + + override fun clone(): Comment { + return super.clone() as Comment + } + + fun isXmlDeclaration(): Boolean { + val data = getData() + return isXmlDeclarationData(data) + } + + /** + * Attempt to cast this comment to an XML Declaration node. 
+ * @return an XML declaration if it could be parsed as one, null otherwise. + */ + /*@Nullable*/ + fun asXmlDeclaration(): XmlDeclaration? { + val data = getData() + var decl: XmlDeclaration? = null + val declContent = data.substring(1, data.length - 1) + // make sure this bogus comment is not immediately followed by another, treat as comment if so + if (isXmlDeclarationData(declContent)) return null + val fragment = "<$declContent>" + // use the HTML parser not XML, so we don't get into a recursive XML Declaration on contrived data + val doc: Document = + Parser.htmlParser().settings(ParseSettings.preserveCase).parseInput(fragment, baseUri()) + if (doc.body().childrenSize() > 0) { + val el: Element = doc.body().child(0) + decl = XmlDeclaration( + NodeUtils.parser(doc)!!.settings()!!.normalizeTag(el.tagName()), + data.startsWith("!"), + ) + decl.attributes().addAll(el.attributes()) + } + return decl + } + + companion object { + private fun isXmlDeclarationData(data: String): Boolean { + return data.length > 1 && (data.startsWith("!") || data.startsWith("?")) + } + } +} diff --git a/ksoup/src/commonMain/kotlin/com/fleeksoft/ksoup/nodes/DataNode.kt b/ksoup/src/commonMain/kotlin/com/fleeksoft/ksoup/nodes/DataNode.kt new file mode 100644 index 00000000..a218ce10 --- /dev/null +++ b/ksoup/src/commonMain/kotlin/com/fleeksoft/ksoup/nodes/DataNode.kt @@ -0,0 +1,67 @@ +package com.fleeksoft.ksoup.nodes + +import okio.IOException + +/** + * A data node, for contents of style, script tags etc, where contents should not show in text(). + * + * @author Sabeeh, fleeksoft@gmail.com + */ +class DataNode(data: String) : LeafNode() { + /** + * Create a new DataNode. + * @param data data contents + */ + init { + value = data + } + + override fun nodeName(): String { + return "#data" + } + + fun getWholeData(): String = coreValue() + + /** + * Set the data contents of this node. 
+ * @param data unencoded data + * @return this node, for chaining + */ + fun setWholeData(data: String?): DataNode { + coreValue(data) + return this + } + + @Throws(IOException::class) + override fun outerHtmlHead(accum: Appendable, depth: Int, out: Document.OutputSettings) { + if (out.syntax() == Document.OutputSettings.Syntax.xml) { + // In XML mode, output data nodes as CDATA, so can parse as XML + accum + .append("") + } else { + // In HTML, data is not escaped in return from data nodes, so " in script, style is plain + accum.append(getWholeData()) + } + } + + override fun outerHtmlTail( + accum: Appendable, + depth: Int, + out: Document.OutputSettings, + ) { + } + + override fun toString(): String { + return outerHtml() + } + + override fun createClone(): Node { + return DataNode(value as String) + } + + override fun clone(): DataNode { + return super.clone() as DataNode + } +} diff --git a/ksoup/src/commonMain/kotlin/com/fleeksoft/ksoup/nodes/Document.kt b/ksoup/src/commonMain/kotlin/com/fleeksoft/ksoup/nodes/Document.kt new file mode 100644 index 00000000..7d65d25f --- /dev/null +++ b/ksoup/src/commonMain/kotlin/com/fleeksoft/ksoup/nodes/Document.kt @@ -0,0 +1,649 @@ +package com.fleeksoft.ksoup.nodes + +import io.ktor.utils.io.charsets.Charset +import io.ktor.utils.io.charsets.CharsetEncoder +import io.ktor.utils.io.charsets.name +import com.fleeksoft.ksoup.helper.DataUtil +import com.fleeksoft.ksoup.helper.Validate +import com.fleeksoft.ksoup.internal.StringUtil +import com.fleeksoft.ksoup.parser.ParseSettings +import com.fleeksoft.ksoup.parser.Parser +import com.fleeksoft.ksoup.parser.Tag +import com.fleeksoft.ksoup.select.Elements +import com.fleeksoft.ksoup.select.Evaluator +import com.fleeksoft.ksoup.select.Selector +import com.fleeksoft.ksoup.ported.Cloneable + +/** + * A HTML Document. + * + * @author Sabeeh, fleeksoft@gmail.com + */ +class Document(private val namespace: String, private val location: String?) 
: + Element(Tag.valueOf("#root", namespace, ParseSettings.htmlDefault), location) { + // private var connection: Connection? = null // the connection this doc was fetched from, if any + private var outputSettings = OutputSettings() + private var parser: Parser? + private var quirksMode = QuirksMode.noQuirks + private var updateMetaCharset = false + + /** + * Create a new, empty Document, in the HTML namespace. + * @param baseUri base URI of document + * @see com.fleeksoft.ksoup.Ksoup.parseFile + * @see .Document + */ + constructor(baseUri: String?) : this(Parser.NamespaceHtml, baseUri) + + /** + * Get the URL this Document was parsed from. If the starting URL is a redirect, + * this will return the final URL from which the document was served from. + * + * Will return an empty string if the location is unknown (e.g. if parsed from a String). + * @return location + */ + fun location(): String? { + return location + } + + /** + * Returns the Connection (Request/Response) object that was used to fetch this document, if any; otherwise, a new + * default Connection object. This can be used to continue a session, preserving settings and cookies, etc. + * @return the Connection (session) associated with this Document, or an empty one otherwise. + * @see Connection.newRequest + */ + /*fun connection(): Connection { + return if (connection == null) Jsoup.newSession() else connection + }*/ + + /** + * Returns this Document's doctype. + * @return document type, or null if not set + */ + /*@Nullable*/ + fun documentType(): DocumentType? { + for (node in childNodes) { + if (node is DocumentType) { + return node + } else if (node !is LeafNode) { + // scans forward across comments, text, processing instructions etc + break + } + } + return null + // todo - add a set document type? + } + + /** + * Find the root HTML element, or create it if it doesn't exist. + * @return the root HTML element. + */ + private fun htmlEl(): Element { + var el: Element? 
= firstElementChild() + while (el != null) { + if (el.normalName() == "html") return el + el = el.nextElementSibling() + } + return appendElement("html") + } + + /** + * Get this document's `head` element. + * + * + * As a side-effect, if this Document does not already have a HTML structure, it will be created. If you do not want + * that, use `#selectFirst("head")` instead. + * + * @return `head` element. + */ + fun head(): Element { + val html: Element = htmlEl() + var el: Element? = html.firstElementChild() + while (el != null) { + if (el.normalName() == "head") return el + el = el.nextElementSibling() + } + return html.prependElement("head") + } + + /** + * Get this document's `` or `` element. + * + * + * As a **side-effect**, if this Document does not already have a HTML structure, it will be created with a `` element. If you do not want that, use `#selectFirst("body")` instead. + * + * @return `body` element for documents with a ``, a new `` element if the document + * had no contents, or the outermost ` element` for frameset documents. + */ + fun body(): Element { + val html: Element = htmlEl() + var el: Element? = html.firstElementChild() + while (el != null) { + if ("body" == el.normalName() || "frameset" == el.normalName()) return el + el = el.nextElementSibling() + } + return html.appendElement("body") + } + + /** + * Get each of the `
` elements contained in this document. + * @return a List of FormElement objects, which will be empty if there are none. + * @see Elements.forms + * @see FormElement.elements + * @since 1.15.4 + */ + fun forms(): List { + return select("form").forms() + } + + /** + * Selects the first [FormElement] in this document that matches the query. If none match, throws an + * [IllegalArgumentException]. + * @param cssQuery a [Selector] CSS query + * @return the first matching `` element + * @throws IllegalArgumentException if no match is found + * @since 1.15.4 + */ + fun expectForm(cssQuery: String): FormElement? { + val els: Elements = select(cssQuery) + for (el in els) { + if (el is FormElement) return el + } + Validate.fail("No form elements matched the query '$cssQuery' in the document.") + return null // (not really) + } + + /** + * Get the string contents of the document's `title` element. + * @return Trimmed title, or empty string if none set. + */ + fun title(): String { + // title is a preserve whitespace tag (for document output), but normalised here + val titleEl: Element? = head().selectFirst(titleEval) + return if (titleEl != null) StringUtil.normaliseWhitespace(titleEl.text()).trim() else "" + } + + /** + * Create a new, empty Document, in the specified namespace. + * @param namespace the namespace of this Document's root node. + * @param baseUri base URI of document + * @see com.fleeksoft.ksoup.Jsoup.parse + * @see .createShell + */ + init { + parser = Parser.htmlParser() // default, but overridable + } + + /** + * Set the document's `title` element. Updates the existing element, or adds `title` to `head` if + * not present + * @param title string to set as title + */ + fun title(title: String) { + Validate.notNull(title) + var titleEl: Element? = head().selectFirst(titleEval) + if (titleEl == null) { + // add to head + titleEl = head().appendElement("title") + } + titleEl.text(title) + } + + /** + * Create a new Element, with this document's base uri. 
Does not make the new element a child of this document. + * @param tagName element tag name (e.g. `a`) + * @return new element + */ + fun createElement(tagName: String): Element { + return Element( + Tag.valueOf( + tagName, + parser!!.defaultNamespace(), + ParseSettings.preserveCase, + ), + this.baseUri(), + ) + } + + override fun outerHtml(): String { + return super.html() // no outer wrapper tag + } + + /** + * Set the text of the `body` of this document. Any existing nodes within the body will be cleared. + * @param text unencoded text + * @return this document + */ + override fun text(text: String): Element { + body().text(text) // overridden to not nuke doc structure + return this + } + + override fun nodeName(): String { + return "#document" + } + + /** + * Sets the charset used in this document. This method is equivalent + * to [ OutputSettings.charset(Charset)][OutputSettings.charset] but in addition it updates the + * charset / encoding element within the document. + * + * + * This enables + * [meta charset update][.updateMetaCharsetElement]. + * + * + * If there's no element with charset / encoding information yet it will + * be created. Obsolete charset / encoding definitions are removed! + * + * + * **Elements used:** + * + * + * * **Html:** *<meta charset="CHARSET">* + * * **Xml:** *<?xml version="1.0" encoding="CHARSET">* + * + * + * @param charset Charset + * + * @see .updateMetaCharsetElement + * @see OutputSettings.charset + */ + fun charset(charset: Charset) { + updateMetaCharsetElement(true) + outputSettings.charset(charset) + ensureMetaCharsetElement() + } + + /** + * Returns the charset used in this document. This method is equivalent + * to [OutputSettings.charset]. 
+ * + * @return Current Charset + * + * @see OutputSettings.charset + */ + fun charset(): Charset { + return outputSettings.charset() + } + + /** + * Sets whether the element with charset information in this document is + * updated on changes through [ Document.charset(Charset)][.charset] or not. + * + * + * If set to false *(default)* there are no elements + * modified. + * + * @param update If true the element updated on charset + * changes, false if not + * + * @see .charset + */ + fun updateMetaCharsetElement(update: Boolean) { + updateMetaCharset = update + } + + /** + * Returns whether the element with charset information in this document is + * updated on changes through [ Document.charset(Charset)][.charset] or not. + * + * @return Returns true if the element is updated on charset + * changes, false if not + */ + fun updateMetaCharsetElement(): Boolean { + return updateMetaCharset + } + + override fun clone(): Document { + return super.clone() as Document + } + + override fun createClone(): Node { + val document = Document(namespace, location) + document.outputSettings = this.outputSettings.clone() + return document as Document + } + + override fun shallowClone(): Document { + val clone = Document(this.tag().namespace(), baseUri()) + if (attributes != null) clone.attributes = attributes!!.clone() + clone.outputSettings = outputSettings.clone() + return clone + } + + /** + * Ensures a meta charset (html) or xml declaration (xml) with the current + * encoding used. This only applies with + * [updateMetaCharset][.updateMetaCharsetElement] set to + * true, otherwise this method does nothing. 
+ * + * + * * An existing element gets updated with the current charset + * * If there's no element yet it will be inserted + * * Obsolete elements are removed + * + * + * + * **Elements used:** + * + * + * * **Html:** *<meta charset="CHARSET">* + * * **Xml:** *<?xml version="1.0" encoding="CHARSET">* + * + */ + private fun ensureMetaCharsetElement() { + if (updateMetaCharset) { + val syntax = outputSettings().syntax() + if (syntax == OutputSettings.Syntax.html) { + val metaCharset: Element? = selectFirst("meta[charset]") + if (metaCharset != null) { + metaCharset.attr("charset", charset().name) + } else { + head().appendElement("meta").attr("charset", charset().name) + } + select("meta[name=charset]").remove() // Remove obsolete elements + } else if (syntax == OutputSettings.Syntax.xml) { + val node: Node = ensureChildNodes().get(0) + if (node is XmlDeclaration) { + var decl: XmlDeclaration = node + if (decl.name() == "xml") { + decl.attr("encoding", charset().name) + if (decl.hasAttr("version")) decl.attr("version", "1.0") + } else { + decl = XmlDeclaration("xml", false) + decl.attr("version", "1.0") + decl.attr("encoding", charset().name) + prependChild(decl) + } + } else { + val decl = XmlDeclaration("xml", false) + decl.attr("version", "1.0") + decl.attr("encoding", charset().name) + prependChild(decl) + } + } + } + } + + /** + * A Document's output settings control the form of the text() and html() methods. + */ + data class OutputSettings( + private var escapeMode: Entities.EscapeMode = Entities.EscapeMode.base, + private var charset: Charset = DataUtil.UTF_8, + var coreCharset: Entities.CoreCharset = Entities.CoreCharset.byName(charset.name), // fast encoders for ascii and utf8 + private var prettyPrint: Boolean = true, + private var outline: Boolean = false, + private var indentAmount: Int = 1, + private var maxPaddingWidth: Int = 30, + private var syntax: Syntax = Syntax.html, + ) : Cloneable { + private var charsetEncoder: CharsetEncoder? 
= null + + /** + * The output serialization syntax. + */ + enum class Syntax { + html, xml + } + + /** + * Get the document's current HTML escape mode: `base`, which provides a limited set of named HTML + * entities and escapes other characters as numbered entities for maximum compatibility; or `extended`, + * which uses the complete set of HTML named entities. + * + * + * The default escape mode is `base`. + * @return the document's current escape mode + */ + fun escapeMode(): Entities.EscapeMode { + return escapeMode + } + + /** + * Set the document's escape mode, which determines how characters are escaped when the output character set + * does not support a given character:- using either a named or a numbered escape. + * @param escapeMode the new escape mode to use + * @return the document's output settings, for chaining + */ + fun escapeMode(escapeMode: Entities.EscapeMode): OutputSettings { + this.escapeMode = escapeMode + return this + } + + /** + * Get the document's current output charset, which is used to control which characters are escaped when + * generating HTML (via the `html()` methods), and which are kept intact. + * + * + * Where possible (when parsing from a URL or File), the document's output charset is automatically set to the + * input charset. Otherwise, it defaults to UTF-8. + * @return the document's current charset. + */ + fun charset(): Charset { + return charset + } + + /** + * Update the document's output charset. + * @param charset the new charset to use. + * @return the document's output settings, for chaining + */ + fun charset(charset: Charset): OutputSettings { + this.charset = charset + coreCharset = Entities.CoreCharset.byName(charset.name) + return this + } + + /** + * Update the document's output charset. + * @param charset the new charset (by name) to use. 
+ * @return the document's output settings, for chaining + */ + fun charset(charset: String): OutputSettings { + charset(Charset.forName(charset)) + return this + } + + fun prepareEncoder(): CharsetEncoder { + // created at start of OuterHtmlVisitor so each pass has own encoder, so OutputSettings can be shared among threads + charsetEncoder = charset.newEncoder() + return charsetEncoder!! + } + + fun encoder(): CharsetEncoder { + return charsetEncoder ?: prepareEncoder() + } + + /** + * Get the document's current output syntax. + * @return current syntax + */ + fun syntax(): Syntax { + return syntax + } + + /** + * Set the document's output syntax. Either `html`, with empty tags and boolean attributes (etc), or + * `xml`, with self-closing tags. + * + * When set to [xml][Document.OutputSettings.Syntax.xml], the [escapeMode][.escapeMode] is + * automatically set to [Entities.EscapeMode.xhtml], but may be subsequently changed if desired. + * @param syntax serialization syntax + * @return the document's output settings, for chaining + */ + fun syntax(syntax: Syntax): OutputSettings { + this.syntax = syntax + if (syntax == Syntax.xml) this.escapeMode(Entities.EscapeMode.xhtml) + return this + } + + /** + * Get if pretty printing is enabled. Default is true. If disabled, the HTML output methods will not re-format + * the output, and the output will generally look like the input. + * @return if pretty printing is enabled. + */ + fun prettyPrint(): Boolean { + return prettyPrint + } + + /** + * Enable or disable pretty printing. + * @param pretty new pretty print setting + * @return this, for chaining + */ + fun prettyPrint(pretty: Boolean): OutputSettings { + prettyPrint = pretty + return this + } + + /** + * Get if outline mode is enabled. Default is false. If enabled, the HTML output methods will consider + * all tags as block. + * @return if outline mode is enabled. + */ + fun outline(): Boolean { + return outline + } + + /** + * Enable or disable HTML outline mode. 
+ * @param outlineMode new outline setting + * @return this, for chaining + */ + fun outline(outlineMode: Boolean): OutputSettings { + outline = outlineMode + return this + } + + /** + * Get the current tag indent amount, used when pretty printing. + * @return the current indent amount + */ + fun indentAmount(): Int { + return indentAmount + } + + /** + * Set the indent amount for pretty printing + * @param indentAmount number of spaces to use for indenting each level. Must be >= 0. + * @return this, for chaining + */ + fun indentAmount(indentAmount: Int): OutputSettings { + Validate.isTrue(indentAmount >= 0) + this.indentAmount = indentAmount + return this + } + + /** + * Get the current max padding amount, used when pretty printing + * so very deeply nested nodes don't get insane padding amounts. + * @return the current indent amount + */ + fun maxPaddingWidth(): Int { + return maxPaddingWidth + } + + /** + * Set the max padding amount for pretty printing so very deeply nested nodes don't get insane padding amounts. + * @param maxPaddingWidth number of spaces to use for indenting each level of nested nodes. Must be >= -1. + * Default is 30 and -1 means unlimited. + * @return this, for chaining + */ + fun maxPaddingWidth(maxPaddingWidth: Int): OutputSettings { + Validate.isTrue(maxPaddingWidth >= -1) + this.maxPaddingWidth = maxPaddingWidth + return this + } + + override fun clone(): OutputSettings { + return this.copy() + } + } + + /** + * Get the document's current output settings. + * @return the document's current output settings. + */ + fun outputSettings(): OutputSettings { + return outputSettings + } + + /** + * Set the document's output settings. + * @param outputSettings new output settings. + * @return this document, for chaining. 
+     */
+    fun outputSettings(outputSettings: OutputSettings): Document = apply {
+        Validate.notNull(outputSettings)
+        this.outputSettings = outputSettings
+    }
+
+    /** The quirks-mode values a document can carry. */
+    enum class QuirksMode {
+        noQuirks,
+        quirks,
+        limitedQuirks,
+    }
+
+    /**
+     * Get the document's quirks mode.
+     * @return the current quirks mode
+     */
+    fun quirksMode(): QuirksMode = quirksMode
+
+    /**
+     * Set the document's quirks mode.
+     * @param quirksMode the new quirks mode
+     * @return this document, for chaining
+     */
+    fun quirksMode(quirksMode: QuirksMode): Document = apply { this.quirksMode = quirksMode }
+
+    /**
+     * Get the parser that was used to parse this document.
+     * @return the parser
+     */
+    fun parser(): Parser? = parser
+
+    /**
+     * Set the parser used to create this document. This parser is then used when further parsing within this document
+     * is required.
+     * @param parser the configured parser to use when further parsing is required for this document.
+     * @return this document, for chaining.
+     */
+    fun parser(parser: Parser?): Document = apply { this.parser = parser }
+
+    /**
+     * Set the Connection used to fetch this document. This Connection is used as a session object when further requests are
+     * made (e.g. when a form is submitted).
+     *
+     * @param connection to set
+     * @return this document, for chaining
+     * @see Connection.newRequest
+     * @since 1.14.1
+     */
+    /*fun connection(connection: Connection?): Document {
+        Validate.notNull(connection)
+        this.connection = connection
+        return this
+    }*/
+
+    companion object {
+        /**
+         * Create a valid, empty shell of a document, suitable for adding more elements to.
+         * @param baseUri baseUri of document
+         * @return document with html, head, and body elements.
+         */
+        fun createShell(baseUri: String?): Document {
+            Validate.notNull(baseUri)
+            val doc = Document(baseUri)
+            // NOTE(review): parser() returns the `parser` field, so this assigns the value it just read — confirm it is a no-op.
+            doc.parser = doc.parser()
+            // Build the html > (head, body) skeleton.
+            val html: Element = doc.appendElement("html")
+            html.appendElement("head")
+            html.appendElement("body")
+            return doc
+        }
+
+        private val titleEval: Evaluator = Evaluator.Tag("title")
+    }
+}
diff --git a/ksoup/src/commonMain/kotlin/com/fleeksoft/ksoup/nodes/DocumentType.kt b/ksoup/src/commonMain/kotlin/com/fleeksoft/ksoup/nodes/DocumentType.kt
new file mode 100644
index 00000000..a6755e67
--- /dev/null
+++ b/ksoup/src/commonMain/kotlin/com/fleeksoft/ksoup/nodes/DocumentType.kt
@@ -0,0 +1,108 @@
+package com.fleeksoft.ksoup.nodes
+
+import okio.IOException
+import com.fleeksoft.ksoup.helper.Validate
+import com.fleeksoft.ksoup.internal.StringUtil
+import com.fleeksoft.ksoup.nodes.Document.OutputSettings.Syntax
+
+/**
+ * A `<!DOCTYPE>` node. Constructor arguments are non-null by type, so no runtime null validation is done.
+ */
+class DocumentType(private val name: String, private val publicId: String, private val systemId: String) : LeafNode() {
+    // todo: quirk mode from publicId and systemId
+    /**
+     * Create a new doctype element.
+     * @param name the doctype's name
+     * @param publicId the doctype's public ID
+     * @param systemId the doctype's system ID
+     */
+    init {
+        attr(NAME, name)
+        attr(PUBLIC_ID, publicId)
+        attr(SYSTEM_ID, systemId)
+        updatePubSyskey()
+    }
+
+    /**
+     * Store [value] as the doctype's `PUBLIC`/`SYSTEM` keyword; a `null` value leaves the attribute untouched.
+     */
+    fun setPubSysKey(value: String?) {
+        if (value != null) attr(PUB_SYS_KEY, value)
+    }
+
+    private fun updatePubSyskey() {
+        if (has(PUBLIC_ID)) {
+            attr(PUB_SYS_KEY, PUBLIC_KEY)
+        } else if (has(SYSTEM_ID)) attr(PUB_SYS_KEY, SYSTEM_KEY)
+    }
+
+    /**
+     * Get this doctype's name (when set, or empty string)
+     * @return doctype name
+     */
+    fun name(): String {
+        return attr(NAME)
+    }
+
+    /**
+     * Get this doctype's Public ID (when set, or empty string)
+     * @return doctype Public ID
+     */
+    fun publicId(): String {
+        return attr(PUBLIC_ID)
+    }
+
+    /**
+     * Get this doctype's System ID (when set, or empty string)
+     * @return doctype System ID
+     */
+    fun systemId(): String {
+        return attr(SYSTEM_ID)
+    }
+
+    override fun nodeName(): String {
+        return "#doctype"
+    }
+
+    @Throws(IOException::class)
+    override fun outerHtmlHead(accum: Appendable, depth: Int, out: Document.OutputSettings) {
+        // add a newline if the doctype has a preceding node (which must be a comment)
+        if (siblingIndex > 0 && out.prettyPrint()) accum.append('\n')
+        if (out.syntax() === Syntax.html && !has(PUBLIC_ID) && !has(SYSTEM_ID)) {
+            // looks like a html5 doctype, go lowercase for aesthetics
+            accum.append("<!doctype")
+        } else {
+            accum.append("<!DOCTYPE")
+        }
+        if (has(NAME)) accum.append(" ").append(attr(NAME))
+        if (has(PUB_SYS_KEY)) accum.append(" ").append(attr(PUB_SYS_KEY))
+        if (has(PUBLIC_ID)) accum.append(" \"").append(attr(PUBLIC_ID)).append('"')
+        if (has(SYSTEM_ID)) accum.append(" \"").append(attr(SYSTEM_ID)).append('"')
+        accum.append('>')
+    }
+
+    override fun outerHtmlTail(
+        accum: Appendable,
+        depth: Int,
+        out: Document.OutputSettings,
+    ) {
+    }
+
+    override fun createClone(): Node {
+        return DocumentType(this.name, this.publicId, this.systemId)
+    }
+
+    private fun has(attribute: String): Boolean {
+        return !StringUtil.isBlank(attr(attribute))
+    }
+
+    companion object {
+        // todo needs a bit of a chunky cleanup. this level of detail isn't needed
+        const val PUBLIC_KEY = "PUBLIC"
+        const val SYSTEM_KEY = "SYSTEM"
+        private const val NAME = "name"
+        private const val PUB_SYS_KEY = "pubSysKey" // PUBLIC or SYSTEM
+        private const val PUBLIC_ID = "publicId"
+        private const val SYSTEM_ID = "systemId"
+    }
+}
diff --git a/ksoup/src/commonMain/kotlin/com/fleeksoft/ksoup/nodes/Element.kt b/ksoup/src/commonMain/kotlin/com/fleeksoft/ksoup/nodes/Element.kt
new file mode 100644
index 00000000..387db02c
--- /dev/null
+++ b/ksoup/src/commonMain/kotlin/com/fleeksoft/ksoup/nodes/Element.kt
@@ -0,0 +1,1909 @@
+package com.fleeksoft.ksoup.nodes
+
+import okio.IOException
+import com.fleeksoft.ksoup.helper.ChangeNotifyingArrayList
+import com.fleeksoft.ksoup.helper.Validate
+import com.fleeksoft.ksoup.internal.Normalizer.normalize
+import com.fleeksoft.ksoup.internal.StringUtil
+import com.fleeksoft.ksoup.nodes.TextNode.Companion.lastCharIsWhitespace
+import com.fleeksoft.ksoup.parser.ParseSettings
+import com.fleeksoft.ksoup.parser.Parser
+import com.fleeksoft.ksoup.parser.Tag
+import com.fleeksoft.ksoup.parser.TokenQueue.Companion.escapeCssIdentifier
+import com.fleeksoft.ksoup.select.Collector
+import com.fleeksoft.ksoup.select.Elements
+import com.fleeksoft.ksoup.select.Evaluator
+import com.fleeksoft.ksoup.select.NodeFilter
+import com.fleeksoft.ksoup.select.NodeTraversor
+import com.fleeksoft.ksoup.select.NodeVisitor
+import com.fleeksoft.ksoup.select.QueryParser
+import com.fleeksoft.ksoup.select.Selector
+import com.fleeksoft.ksoup.ported.AtomicBoolean
+import com.fleeksoft.ksoup.ported.Collections
+import com.fleeksoft.ksoup.ported.Consumer
+import com.fleeksoft.ksoup.ported.PatternSyntaxException
+import kotlin.jvm.JvmOverloads
+
+/**
+ * An HTML Element consists of a tag name, attributes, and child nodes (including text nodes and other elements).
+ *
+ *
+ * From an Element, you can extract data, traverse the node graph, and manipulate the HTML.
+ */
+
+open class Element : Node {
+    private var tag: Tag
+    private var _baseUri: String? = null // just for clone
+
+    // points to child elements shadowed from node children
+    private var shadowChildrenRef: List<Element>? = null
+    var childNodes: MutableList<Node> = EmptyNodes
+
+    // field is nullable but all methods for attributes are non-null
+    var attributes: Attributes? = null
+
+    /**
+     * Create a new, standalone element, in the specified namespace.
+     * @param tag tag name
+     * @param namespace namespace for this element
+     */
+    constructor(tag: String, namespace: String) : this(
+        Tag.valueOf(
+            tag,
+            namespace,
+            ParseSettings.preserveCase,
+        ),
+        null,
+    )
+
+    /**
+     * Create a new, standalone element, in the HTML namespace.
+     * @param tag tag name
+     * @see Element
+     */
+    constructor(tag: String) : this(
+        Tag.valueOf(
+            tag,
+            Parser.NamespaceHtml,
+            ParseSettings.preserveCase,
+        ),
+        "",
+        null,
+    )
+
+    /**
+     * Create a new, standalone Element. (Standalone in that it has no parent.)
+     *
+     * @param tag tag of this element
+     * @param baseUri the base URI (optional, may be null to inherit from parent, or "" to clear parent's)
+     * @param attributes initial attributes (optional, may be null)
+     * @see appendChild
+     * @see appendElement
+     */
+    constructor(tag: Tag, baseUri: String?, attributes: Attributes?) {
+        // NOTE(review): this swaps the shared EmptyNodes default for a fresh copy; hasChildNodes() and
+        // ensureChildNodes() compare against EmptyNodes with ==/!= (by content, not identity) — confirm intent.
+        childNodes = EmptyNodes.toMutableList()
+        this.attributes = attributes
+        this.tag = tag
+        _baseUri = baseUri
+        // A null base URI is not stored at all; "" is stored explicitly.
+        if (baseUri != null) this.setBaseUri(baseUri)
+    }
+
+    /**
+     * Create a new Element from a Tag and a base URI.
+     *
+     * @param tag element tag
+     * @param baseUri the base URI of this element. Optional, and will inherit from its parent, if any.
+     * @see Tag.valueOf
+     */
+    constructor(tag: Tag, baseUri: String?) : this(tag, baseUri, null)
+
+    /**
+     * Internal test to check if a nodelist object has been created.
+     */
+    fun hasChildNodes(): Boolean = childNodes != EmptyNodes
+
+    /**
+     * Return the child list, first replacing it with a [NodeList] owned by this element when the current
+     * list compares equal to `EmptyNodes`.
+     */
+    public override fun ensureChildNodes(): MutableList<Node> {
+        if (childNodes == EmptyNodes) {
+            childNodes = NodeList(owner = this, initialCapacity = 4) as MutableList<Node>
+        }
+        return childNodes
+    }
+
+    /** Whether an attributes object has been created for this element. */
+    public override fun hasAttributes(): Boolean = attributes != null
+
+    /**
+     * Get this element's attributes, creating the backing [Attributes] object on first access.
+     */
+    override fun attributes(): Attributes =
+        attributes ?: Attributes().also { attributes = it }
+
+    override fun baseUri(): String = searchUpForAttribute(this, BaseUriKey)
+
+    /** Store [baseUri] in this element's attributes under `BaseUriKey`. */
+    public override fun doSetBaseUri(baseUri: String?) {
+        attributes().put(BaseUriKey, baseUri)
+    }
+
+    override fun childNodeSize(): Int = childNodes.size
+
+    override fun nodeName(): String = tag.name
+
+    /**
+     * Get the name of the tag for this element. E.g. `div`. If you are using
+     * [case preserving parsing][ParseSettings.preserveCase], this will return the source's original case.
+     *
+     * @return the tag name
+     */
+    fun tagName(): String = tag.name
+
+    /**
+     * Get the normalized name of this Element's tag. This will always be the lower-cased version of the tag, regardless
+     * of the tag case preserving setting of the parser. For e.g., `<div>` and `<DiV>` both have a
+     * normal name of `div`.
+     * @return normal name
+     */
+    override fun normalName(): String = tag.normalName()
+
+    /**
+     * Change (rename) the tag of this element. For example, convert a `<span>` to a `<div>` with
+     * `el.tagName("div");`.
+     *
+     * @param tagName new tag name for this element
+     * @param namespace the new namespace for this element; defaults to the current tag's namespace
+     * @return this element, for chaining
+     * @see Elements.tagName
+     */
+    @JvmOverloads
+    fun tagName(tagName: String, namespace: String = tag.namespace()): Element {
+        Validate.notEmptyParam(tagName, "tagName")
+        Validate.notEmptyParam(namespace, "namespace")
+        // maintains the case option of the original parse
+        val settings = NodeUtils.parser(this)!!.settings()
+        tag = Tag.valueOf(tagName, namespace, settings)
+        return this
+    }
+
+    /**
+     * Get the Tag for this element.
+     *
+     * @return the tag object
+     */
+    fun tag(): Tag = tag
+
+    /** Whether this element's tag is a block tag. */
+    fun isBlock(): Boolean = tag.isBlock
+
+    /**
+     * Get the `id` attribute of this element.
+     *
+     * @return The id attribute, if present, or an empty string if not.
+     */
+    fun id(): String = attributes?.getIgnoreCase("id") ?: ""
+
+    /**
+     * Set the `id` attribute of this element.
+     * @param id the ID value to use
+     * @return this Element, for chaining
+     */
+    fun id(id: String?): Element {
+        Validate.notNull(id)
+        attr("id", id)
+        return this
+    }
+
+    /**
+     * Set an attribute value on this element. If this element already has an attribute with the
+     * key, its value is updated; otherwise, a new attribute is added.
+     *
+     * @return this element
+     */
+    override fun attr(attributeKey: String, attributeValue: String?): Element {
+        super.attr(attributeKey, attributeValue)
+        return this
+    }
+
+    /**
+     * Set a boolean attribute value on this element. Setting to `true` sets the attribute value to "" and
+     * marks the attribute as boolean so no value is written out. Setting to `false` removes the attribute
+     * with the same key if it exists.
+     *
+     * @param attributeKey the attribute key
+     * @param attributeValue the attribute value
+     *
+     * @return this element
+     */
+    fun attr(attributeKey: String, attributeValue: Boolean): Element = apply {
+        attributes().put(attributeKey, attributeValue)
+    }
+
+    /**
+     * Get this element's HTML5 custom data attributes. Each attribute in the element that has a key
+     * starting with "data-" is included the dataset.
+     *
+     *
+     * E.g., the element `<div data-package="com.fleeksoft.ksoup" data-language="Java" class="group">...` has the dataset
+     * `package=com.fleeksoft.ksoup, language=java`.
+     *
+     *
+     * This map is a filtered view of the element's attribute map. Changes to one map (add, remove, update) are reflected
+     * in the other map.
+     *
+     *
+     * You can find elements that have data attributes using the `[^data-]` attribute key prefix selector.
+     * @return a map of `key=value` custom data attributes.
+     */
+    fun dataset(): Attributes.Dataset = attributes().dataset()
+
+    /** Get this element's parent, or `null` when the parent node is absent or is not an Element. */
+    override fun parent(): Element? = _parentNode as? Element
+
+    /**
+     * Get this element's parent and ancestors, up to the document root.
+     * @return this element's stack of parents, starting with the closest first.
+     */
+    fun parents(): Elements {
+        val ancestors = Elements()
+        // Walk upwards until the chain ends or the "#root" node is reached (the root itself is not included).
+        generateSequence(parent()) { it.parent() }
+            .takeWhile { !it.isNode("#root") }
+            .forEach { ancestors.add(it) }
+        return ancestors
+    }
+
+    /**
+     * Get a child element of this element, by its 0-based index number.
+     *
+     *
+     * Note that an element can have both mixed Nodes and Elements as children. This method inspects
+     * a filtered list of children that are elements, and the index is based on that filtered list.
+     *
+     *
+     * @param index the index number of the element to retrieve
+     * @return the child element, if it exists, otherwise throws an `IndexOutOfBoundsException`
+     * @see childNode
+     */
+    fun child(index: Int): Element = childElementsList()[index]
+
+    /**
+     * Get the number of child nodes of this element that are elements.
+     *
+     *
+     * This method works on the same filtered list like [child]. Use [childNodes] and
+     * [childNodeSize] to get the unfiltered Nodes (e.g. includes TextNodes etc.)
+     *
+     *
+     * @return the number of child nodes that are elements
+     * @see children
+     * @see child
+     */
+    fun childrenSize(): Int = childElementsList().size
+
+    /**
+     * Get this element's child elements.
+     *
+     *
+     * This is effectively a filter on [childNodes] to get Element nodes.
+     *
+     * @return child elements. If this element has no children, returns an empty list.
+     * @see childNodes
+     */
+    fun children(): Elements = Elements(childElementsList())
+
+    /**
+     * Maintains a shadow copy of this element's child elements. If the nodelist is changed, this cache is invalidated.
+     * TODO - think about pulling this out as a helper as there are other shadow lists (like in Attributes) kept around.
+     * @return a list of child elements
+     */
+    fun childElementsList(): List<Element> {
+        if (childNodeSize() == 0) return EmptyChildren // short circuit creating empty
+        // Cache hit: return the shadow list itself; copying it on every call would defeat the cache.
+        shadowChildrenRef?.let { return it }
+        val elements: List<Element> = childNodes.filterIsInstance<Element>()
+        shadowChildrenRef = elements
+        return elements
+    }
+
+    /**
+     * Clears the cached shadow child elements.
+     */
+    override fun nodelistChanged() {
+        super.nodelistChanged()
+        shadowChildrenRef = null
+    }
+
+    /**
+     * Get this element's child text nodes. The list is unmodifiable but the text nodes may be manipulated.
+     *
+     *
+     * This is effectively a filter on [childNodes] to get Text nodes.
+     * @return child text nodes. If this element has no text nodes, returns an
+     * empty list.
+     *
+     * For example, with the input HTML: `<p>One <span>Two</span> Three <br> Four</p>` with the `p` element selected:
+     *
+     *  * `p.text()` = `"One Two Three Four"`
+     *  * `p.ownText()` = `"One Three Four"`
+     *  * `p.children()` = `Elements[<span>, <br>]`
+     *  * `p.childNodes()` = `List<Node>["One ", <span>, " Three ", <br>, " Four"]`
+     *  * `p.textNodes()` = `List<TextNode>["One ", " Three ", " Four"]`
+     *
+     */
+    fun textNodes(): List<TextNode> {
+        val textNodes: MutableList<TextNode> = childNodes.filterIsInstanceTo(ArrayList<TextNode>())
+        return Collections.unmodifiableList(textNodes)
+    }
+
+    /**
+     * Get this element's child data nodes. The list is unmodifiable but the data nodes may be manipulated.
+     *
+     *
+     * This is effectively a filter on [childNodes] to get Data nodes.
+     *
+     * @return child data nodes. If this element has no data nodes, returns an
+     * empty list.
+     * @see data
+     */
+    fun dataNodes(): List<DataNode> {
+        val dataNodes: MutableList<DataNode> = childNodes.filterIsInstanceTo(ArrayList<DataNode>())
+        return Collections.unmodifiableList(dataNodes)
+    }
+
+    /**
+     * Find elements that match the [Selector] CSS query, with this element as the starting context. Matched elements
+     * may include this element, or any of its children.
+     *
+     * This method is generally more powerful to use than the DOM-type `getElementBy*` methods, because
+     * multiple filters can be combined, e.g.:
+     *
+     *  * `el.select("a[href]")` - finds links (`a` tags with `href` attributes)
+     *  * `el.select("a[href*=example.com]")` - finds links pointing to example.com (loosely)
+     *
+     *
+     * See the query syntax documentation in [com.fleeksoft.ksoup.select.Selector].
+     *
+     * Also known as `querySelectorAll()` in the Web DOM.
+     *
+     * @param cssQuery a [Selector] CSS-like query
+     * @return an [Elements] list containing elements that match the query (empty if none match)
+     * @see Selector selector query syntax
+     *
+     * @see QueryParser.parse
+     * @throws Selector.SelectorParseException (unchecked) on an invalid CSS query.
+     */
+    fun select(cssQuery: String): Elements = Selector.select(cssQuery, this)
+
+    /**
+     * Find elements that match the supplied Evaluator.
+     * This has the same functionality as [select], but may be useful if you are running the same query
+     * many times (on many documents) and want to save the overhead of repeatedly parsing the CSS query.
+     * @param evaluator an element evaluator
+     * @return an [Elements] list containing elements that match the query (empty if none match)
+     */
+    fun select(evaluator: Evaluator): Elements = Selector.select(evaluator, this)
+
+    /**
+     * Find the first Element that matches the [Selector] CSS query, with this element as the starting context.
+     *
+     * This is effectively the same as calling `element.select(query).first()`, but is more efficient as query
+     * execution stops on the first hit.
+     *
+     * Also known as `querySelector()` in the Web DOM.
+     * @param cssQuery cssQuery a [Selector] CSS-like query
+     * @return the first matching element, or **`null`** if there is no match.
+     * @see expectFirst
+     */
+    fun selectFirst(cssQuery: String): Element? = Selector.selectFirst(cssQuery, this)
+
+    /**
+     * Finds the first Element that matches the supplied Evaluator, with this element as the starting context, or
+     * `null` if none match.
+     *
+     * @param evaluator an element evaluator
+     * @return the first matching element (walking down the tree, starting from this element), or `null` if none
+     * match.
+     */
+    fun selectFirst(evaluator: Evaluator): Element? = Collector.findFirst(evaluator, this)
+
+    /**
+     * Just like [selectFirst], but if there is no match, throws an [IllegalArgumentException]. This
+     * is useful if you want to simply abort processing on a failed match.
+     * @param cssQuery a [Selector] CSS-like query
+     * @return the first matching element
+     * @throws IllegalArgumentException if no match is found
+     * @since 1.15.2
+     */
+    fun expectFirst(cssQuery: String): Element {
+        // The message differs depending on whether this element has a parent.
+        val failureMessage = if (parent() != null) {
+            "No elements matched the query '$cssQuery' on element '${this.tagName()}'."
+        } else {
+            "No elements matched the query '$cssQuery' in the document."
+        }
+        return Validate.ensureNotNull(Selector.selectFirst(cssQuery, this), failureMessage) as Element
+    }
+
+    /**
+     * Checks if this element matches the given [Selector] CSS query. Also knows as `matches()` in the Web
+     * DOM.
+     *
+     * @param cssQuery a [Selector] CSS query
+     * @return if this element matches the query
+     */
+    fun `is`(cssQuery: String): Boolean = `is`(QueryParser.parse(cssQuery))
+
+    /**
+     * Check if this element matches the given evaluator.
+     * @param evaluator an element evaluator
+     * @return if this element matches
+     */
+    fun `is`(evaluator: Evaluator?): Boolean {
+        return evaluator!!.matches(root(), this)
+    }
+
+    /**
+     * Find the closest element up the tree of parents that matches the specified CSS query. Will return itself, an
+     * ancestor, or `null` if there is no such matching element.
+     * @param cssQuery a [Selector] CSS query
+     * @return the closest ancestor element (possibly itself) that matches the provided evaluator. `null` if not
+     * found.
+     */
+    fun closest(cssQuery: String): Element? = closest(QueryParser.parse(cssQuery))
+
+    /**
+     * Find the closest element up the tree of parents that matches the specified evaluator. Will return itself, an
+     * ancestor, or `null` if there is no such matching element.
+     * @param evaluator a query evaluator
+     * @return the closest ancestor element (possibly itself) that matches the provided evaluator. `null` if not
+     * found.
+     */
+    fun closest(evaluator: Evaluator?): Element? {
+        Validate.notNull(evaluator)
+        val matcher = evaluator!!
+        val root = root()
+        // Start with this element itself, then follow parent links until one matches or the chain ends.
+        return generateSequence(this) { it.parent() }.firstOrNull { matcher.matches(root, it) }
+    }
+
+    /**
+     * Find Elements that match the supplied XPath expression.
+     *
+     * Note that for convenience of writing the Xpath expression, namespaces are disabled, and queries can be
+     * expressed using the element's local name only.
+     *
+     * By default, XPath 1.0 expressions are supported. If you would to use XPath 2.0 or higher, you can provide an
+     * alternate XPathFactory implementation:
+     *
+     *  1. Add the implementation to your classpath. E.g. to use [Saxon-HE](https://www.saxonica.com/products/products.xml), add [net.sf.saxon:Saxon-HE](https://mvnrepository.com/artifact/net.sf.saxon/Saxon-HE) to your build.
+     *  1. Set the system property `javax.xml.xpath.XPathFactory:com.fleeksoft.ksoup` to the implementing classname. E.g.:
+     *
+     *     `System.setProperty(W3CDom.XPathFactoryProperty, "net.sf.saxon.xpath.XPathFactoryImpl");`
+     *
+     *
+     *
+     * @param xpath XPath expression
+     * @return matching elements, or an empty list if none match.
+     * @see selectXpath
+     * @since 1.14.3
+     */
+    /*fun selectXpath(xpath: String?): Elements {
+        return Elements(NodeUtils.selectXpath(xpath, this, Element::class))
+    }*/
+
+    /**
+     * Find Nodes that match the supplied XPath expression.
+     *
+     * For example, to select TextNodes under `p` elements:
+     *
+     *     List<TextNode> textNodes = doc.selectXpath("//body//p//text()", TextNode.class);
+     *
+     * Note that in the com.fleeksoft.ksoup DOM, Attribute objects are not Nodes. To directly select attribute values, do something
+     * like:
+     *
+     *     List<String> hrefs = doc.selectXpath("//a").eachAttr("href");
+     *
+     * @param xpath XPath expression
+     * @param nodeType the com.fleeksoft.ksoup node type to return
+     * @see selectXpath
+     * @return a list of matching nodes
+     * @since 1.14.3
+     */
+    /*fun <T : Node> selectXpath(xpath: String?, nodeType: KClass<T>): List<T> {
+        return NodeUtils.selectXpath(xpath, this, nodeType)
+    }*/
+
+    /**
+     * Insert a node to the end of this Element's children. The incoming node will be re-parented.
+     *
+     * @param child node to add.
+     * @return this Element, for chaining
+     * @see prependChild
+     * @see insertChildren
+     */
+    fun appendChild(child: Node): Element {
+        // was - Node#addChildren(child). short-circuits an array create and a loop.
+        reparentChild(child)
+        ensureChildNodes()
+        childNodes.add(child)
+        // The new node is last, so its sibling index is the last valid index.
+        child.siblingIndex = childNodes.size - 1
+        return this
+    }
+
+    /**
+     * Insert the given nodes to the end of this Element's children.
+     *
+     * @param children nodes to add
+     * @return this Element, for chaining
+     * @see insertChildren
+     */
+    fun appendChildren(children: Collection<Node>): Element {
+        insertChildren(-1, children)
+        return this
+    }
+
+    /**
+     * Add this element to the supplied parent element, as its next child.
+     *
+     * @param parent element to which this element will be appended
+     * @return this element, so that you can continue modifying the element
+     */
+    fun appendTo(parent: Element): Element {
+        Validate.notNull(parent)
+        parent.appendChild(this)
+        return this
+    }
+
+    /**
+     * Add a node to the start of this element's children.
+     *
+     * @param child node to add.
+     * @return this element, so that you can add more child nodes or elements.
+     */
+    fun prependChild(child: Node?): Element {
+        Validate.notNull(child)
+        addChildren(0, child!!)
+        return this
+    }
+
+    /**
+     * Insert the given nodes to the start of this Element's children.
+     *
+     * @param children nodes to add
+     * @return this Element, for chaining
+     * @see insertChildren
+     */
+    fun prependChildren(children: Collection<Node>): Element {
+        insertChildren(0, children)
+        return this
+    }
+
+    /**
+     * Inserts the given child nodes into this element at the specified index. Current nodes will be shifted to the
+     * right. The inserted nodes will be moved from their current parent. To prevent moving, copy the nodes first.
+     *
+     * @param index 0-based index to insert children at. Specify `0` to insert at the start, `-1` at the
+     * end
+     * @param children child nodes to insert
+     * @return this element, for chaining.
+     */
+    fun insertChildren(index: Int, children: Collection<Node>): Element {
+        // Same rules as the vararg overload, so share its index handling instead of repeating it.
+        val nodeArray: Array<Node> = children.toTypedArray()
+        return insertChildren(index, *nodeArray)
+    }
+
+    /**
+     * Inserts the given child nodes into this element at the specified index. Current nodes will be shifted to the
+     * right. The inserted nodes will be moved from their current parent. To prevent moving, copy the nodes first.
+     *
+     * @param index 0-based index to insert children at. Specify `0` to insert at the start, `-1` at the
+     * end
+     * @param children child nodes to insert
+     * @return this element, for chaining.
+     */
+    fun insertChildren(index: Int, vararg children: Node): Element {
+        val currentSize = childNodeSize()
+        // Negative indexes roll around: -1 is "after the last child".
+        val insertAt = if (index < 0) index + currentSize + 1 else index
+        Validate.isTrue(insertAt in 0..currentSize, "Insert position out of bounds.")
+        addChildren(insertAt, *children)
+        return this
+    }
+
+    /**
+     * Create a new element by tag name, and add it as the last child.
+     *
+     * @param tagName the name of the tag (e.g. `div`).
+     * @param namespace namespace for the new element; defaults to this element's tag namespace
+     * @return the new element, to allow you to add content to it, e.g.:
+     * `parent.appendElement("h1").attr("id", "header").text("Welcome");`
+     */
+    @JvmOverloads
+    fun appendElement(tagName: String, namespace: String = tag.namespace()): Element {
+        val childTag = Tag.valueOf(tagName, namespace, NodeUtils.parser(this)!!.settings())
+        return Element(childTag, baseUri()).also { appendChild(it) }
+    }
+
+    /**
+     * Create a new element by tag name, and add it as the first child.
+     *
+     * @param tagName the name of the tag (e.g. `div`).
+     * @param namespace namespace for the new element; defaults to this element's tag namespace
+     * @return the new element, to allow you to add content to it, e.g.:
+     * `parent.prependElement("h1").attr("id", "header").text("Welcome");`
+     */
+    @JvmOverloads
+    fun prependElement(tagName: String, namespace: String = tag.namespace()): Element {
+        val childTag = Tag.valueOf(tagName, namespace, NodeUtils.parser(this)!!.settings())
+        return Element(childTag, baseUri()).also { prependChild(it) }
+    }
+
+    /**
+     * Create and append a new TextNode to this element.
+     *
+     * @param text the (un-encoded) text to add
+     * @return this element
+     */
+    fun appendText(text: String): Element {
+        Validate.notNull(text)
+        appendChild(TextNode(text))
+        return this
+    }
+
+    /**
+     * Create and prepend a new TextNode to this element.
+     *
+     * @param text the decoded text to add
+     * @return this element
+     */
+    fun prependText(text: String): Element {
+        Validate.notNull(text)
+        prependChild(TextNode(text))
+        return this
+    }
+
+    /**
+     * Add inner HTML to this element. The supplied HTML will be parsed, and each node appended to the end of the children.
+     * @param html HTML to add inside this element, after the existing HTML
+     * @return this element
+     * @see html
+     */
+    fun append(html: String): Element {
+        val nodes: List<Node> = NodeUtils.parser(this)!!.parseFragmentInput(html, this, baseUri())
+        addChildren(*nodes.toTypedArray())
+        return this
+    }
+
+    /**
+     * Add inner HTML into this element.
+     * The supplied HTML will be parsed, and each node prepended to the start of the element's children.
+     * @param html HTML to add inside this element, before the existing HTML
+     * @return this element
+     * @see html
+     */
+    fun prepend(html: String): Element {
+        val nodes: List<Node> = NodeUtils.parser(this)!!.parseFragmentInput(html, this, baseUri())
+        addChildren(0, *nodes.toTypedArray())
+        return this
+    }
+
+    /**
+     * Insert the specified HTML into the DOM before this element (as a preceding sibling).
+     *
+     * @param html HTML to add before this element
+     * @return this element, for chaining
+     * @see after
+     */
+    override fun before(html: String): Element = super.before(html) as Element
+
+    /**
+     * Insert the specified node into the DOM before this node (as a preceding sibling).
+     * @param node to add before this element
+     * @return this Element, for chaining
+     * @see after
+     */
+    override fun before(node: Node?): Element = super.before(node) as Element
+
+    /**
+     * Insert the specified HTML into the DOM after this element (as a following sibling).
+     *
+     * @param html HTML to add after this element
+     * @return this element, for chaining
+     * @see before
+     */
+    override fun after(html: String): Element = super.after(html) as Element
+
+    /**
+     * Insert the specified node into the DOM after this node (as a following sibling).
+     * @param node to add after this element
+     * @return this element, for chaining
+     * @see before
+     */
+    override fun after(node: Node): Element = super.after(node) as Element
+
+    /**
+     * Remove all the element's child nodes. Any attributes are left as-is. Each child node has its parent set to
+     * `null`.
+     * @return this element
+     */
+    override fun empty(): Element {
+        // Detach each of the children -> parent links:
+        childNodes.forEach { child -> child._parentNode = null }
+        childNodes.clear()
+        return this
+    }
+
+    /**
+     * Wrap the supplied HTML around this element.
+     *
+     * @param html HTML to wrap around this element, e.g. `<div class="head"></div>`. Can be arbitrarily deep.
+     * @return this element, for chaining.
+     */
+    override fun wrap(html: String): Element = super.wrap(html) as Element
+
+    /**
+     * Get a CSS selector that will uniquely select this element.
+     *
+     *
+     * If the element has an ID, returns #id;
+     * otherwise returns the parent (if any) CSS selector, followed by '>',
+     * followed by a unique selector for the element (tag.class.class:nth-child(n)).
+     *
+     *
+     * @return the CSS Path that can be used to retrieve the element in a selector.
+     */
+    fun cssSelector(): String {
+        val id = id()
+        if (id.isNotEmpty()) {
+            // prefer to return the ID - but check that it's actually unique first!
+            val idSel = "#" + escapeCssIdentifier(id)
+            val doc: Document = ownerDocument() ?: return idSel // no ownerdoc, return the ID selector
+            val els: Elements = doc.select(idSel)
+            // Sizes are compared by value (==); only the element itself is compared by identity.
+            if (els.size == 1 && els[0] === this) return idSel
+            // otherwise, continue to the nth-child impl
+        }
+        val selector: StringBuilder = StringUtil.borrowBuilder()
+        var el: Element? = this
+        // Prepend one component per ancestor, stopping before the Document.
+        while (el != null && el !is Document) {
+            selector.insert(0, el.cssSelectorComponent())
+            el = el.parent()
+        }
+        return StringUtil.releaseBuilder(selector)
+    }
+
+    /**
+     * Build this element's own piece of the CSS path: escaped tag name, escaped class names, a leading ` > `
+     * when it has a non-Document parent, and `:nth-child(n)` when the parent has more than one match.
+     */
+    private fun cssSelectorComponent(): String {
+        // Escape tagname, and translate HTML namespace ns:tag to CSS namespace syntax ns|tag
+        val tagName: String = escapeCssIdentifier(tagName()).replace("\\:", "|")
+        val selector: StringBuilder = StringUtil.borrowBuilder().append(tagName)
+        // String classes = StringUtil.join(classNames().stream().map(TokenQueue::escapeCssIdentifier).iterator(), ".");
+        // todo - replace with ^^ in 1.16.1 when we enable Android support for stream etc
+        val escapedClasses: StringUtil.StringJoiner = StringUtil.StringJoiner(".")
+        for (name in classNames()) escapedClasses.add(escapeCssIdentifier(name))
+        val classes: String = escapedClasses.complete()
+        if (classes.isNotEmpty()) selector.append('.').append(classes)
+        val parent: Element? = parent()
+        if (parent == null || parent is Document) {
+            // don't add Document to selector, as will always have a html node
+            return StringUtil.releaseBuilder(selector)
+        }
+        selector.insert(0, " > ")
+        if (parent.select(selector.toString()).size > 1) {
+            selector.append(":nth-child(${elementSiblingIndex() + 1})")
+        }
+        return StringUtil.releaseBuilder(selector)
+    }
+
+    /**
+     * Get sibling elements. If the element has no sibling elements, returns an empty list. An element is not a sibling
+     * of itself, so will not be included in the returned list.
+     * @return sibling elements
+     */
+    fun siblingElements(): Elements {
+        val parentNode = _parentNode ?: return Elements()
+        val siblings = Elements()
+        for (el in (parentNode as Element).childElementsList()) {
+            if (el !== this) siblings.add(el)
+        }
+        return siblings
+    }
+
+    /**
+     * Gets the next sibling element of this element. E.g., if a `div` contains two `p`s,
+     * the `nextElementSibling` of the first `p` is the second `p`.
*
     *
     * This is similar to [nextSibling], but specifically finds only Elements
     *
     * @return the next element, or null if there is no next element
     * @see previousElementSibling
     */
    fun nextElementSibling(): Element? {
        // Walk forward over sibling nodes until one of them is an Element.
        var candidate: Node? = nextSibling()
        while (candidate != null) {
            if (candidate is Element) return candidate
            candidate = candidate.nextSibling()
        }
        return null
    }

    /**
     * Get each of the sibling elements that come after this element.
     *
     * @return each of the element siblings after this element, or an empty list if there are no next sibling elements
     */
    fun nextElementSiblings(): Elements = nextElementSiblings(next = true)

    /**
     * Gets the previous element sibling of this element.
     * @return the previous element, or null if there is no previous element
     * @see nextElementSibling
     */
    fun previousElementSibling(): Element? {
        // Walk backward over sibling nodes until one of them is an Element.
        var candidate: Node? = previousSibling()
        while (candidate != null) {
            if (candidate is Element) return candidate
            candidate = candidate.previousSibling()
        }
        return null
    }

    /**
     * Get each of the element siblings before this element.
     *
     * @return the previous element siblings, or an empty list if there are none.
     */
    fun previousElementSiblings(): Elements = nextElementSiblings(next = false)

    /**
     * Shared implementation for [nextElementSiblings] and [previousElementSiblings]: wraps this
     * element in a single-item [Elements] and delegates to `nextAll()` or `prevAll()` on it.
     * Returns an empty [Elements] when this element has no parent node.
     *
     * @param next `true` for the following siblings, `false` for the preceding ones
     */
    private fun nextElementSiblings(next: Boolean): Elements {
        if (_parentNode == null) return Elements()
        val self = Elements()
        self.add(this)
        if (next) return self.nextAll()
        return self.prevAll()
    }

    /**
     * Gets the first Element sibling of this element. That may be this element.
     * @return the first sibling that is an element (aka the parent's first element child)
     */
    fun firstElementSibling(): Element? {
        val parent: Element = parent() ?: return this // orphan is its own first sibling
        return parent.firstElementChild()
    }

    /**
     * Get the list index of this element in its element sibling list. I.e. if this is the first element
     * sibling, returns 0.
* @return position in element sibling list
     */
    fun elementSiblingIndex(): Int {
        val parent: Element = parent() ?: return 0
        return indexInList(this, parent.childElementsList())
    }

    /**
     * Gets the last element sibling of this element. That may be this element.
     * @return the last sibling that is an element (aka the parent's last element child)
     */
    fun lastElementSibling(): Element? {
        val parent: Element = parent() ?: return this
        return parent.lastElementChild()
    }

    /**
     * Gets the first child of this Element that is an Element, or `null` if there is none.
     * @return the first Element child node, or null.
     * @see firstChild
     * @see lastElementChild
     * @since 1.15.2
     */
    fun firstElementChild(): Element? {
        // Skip forward over non-Element children; ends on the first Element or on null.
        var node: Node? = firstChild()
        while (node != null && node !is Element) {
            node = node.nextSibling()
        }
        return node as Element?
    }

    /**
     * Gets the last child of this Element that is an Element, or `null` if there is none.
     * @return the last Element child node, or null.
     * @see lastChild
     * @see firstElementChild
     * @since 1.15.2
     */
    fun lastElementChild(): Element? {
        // Skip backward over non-Element children; ends on the last Element or on null.
        var node: Node? = lastChild()
        while (node != null && node !is Element) {
            node = node.previousSibling()
        }
        return node as Element?
    }

    // DOM type methods

    /**
     * Finds elements, including and recursively under this element, with the specified tag name.
     * @param tagName The tag name to search for (case insensitively).
     * @return a matching unmodifiable list of elements. Will be empty if this element and none of its children match.
     */
    fun getElementsByTag(tagName: String?): Elements {
        Validate.notEmpty(tagName)
        val normalizedName = normalize(tagName)
        return Collector.collect(Evaluator.Tag(normalizedName), this)
    }

    /**
     * Find an element by ID, including or under this element.
*
     *
     * Note that this finds the first matching ID, starting with this element. If you search down from a different
     * starting point, it is possible to find a different element by ID. For unique element by ID within a Document,
     * use [Document.getElementById]
     * @param id The ID to search for.
     * @return The first matching element by ID, starting with this element, or null if none found.
     */
    fun getElementById(id: String): Element? {
        Validate.notEmpty(id)
        val matches: Elements = Collector.collect(Evaluator.Id(id), this)
        // Only the first collected match is of interest.
        return if (matches.isEmpty()) null else matches[0]
    }

    /**
     * Find elements that have this class, including or under this element. Case-insensitive.
     *
     *
     * Elements can have multiple classes (e.g. `
`). This method
     * checks each class, so you can find the above with `el.getElementsByClass("header");`.
     *
     * @param className the name of the class to search for.
     * @return elements with the supplied class name, empty if none
     * @see hasClass
     * @see classNames
     */
    fun getElementsByClass(className: String): Elements {
        Validate.notEmpty(className)
        val evaluator = Evaluator.Class(className)
        return Collector.collect(evaluator, this)
    }

    /**
     * Find elements that have a named attribute set. Case-insensitive.
     *
     * @param key name of the attribute, e.g. `href`
     * @return elements that have this attribute, empty if none
     */
    fun getElementsByAttribute(key: String): Elements {
        Validate.notEmpty(key)
        // The key is validated as given, then trimmed of leading/trailing chars <= ' ' before matching.
        val trimmedKey = key.trim { it <= ' ' }
        return Collector.collect(Evaluator.Attribute(trimmedKey), this)
    }

    /**
     * Find elements that have an attribute name starting with the supplied prefix. Use `data-` to find elements
     * that have HTML5 datasets.
     * @param keyPrefix name prefix of the attribute e.g. `data-`
     * @return elements that have attribute names that start with the prefix, empty if none.
     */
    fun getElementsByAttributeStarting(keyPrefix: String): Elements {
        Validate.notEmpty(keyPrefix)
        // The prefix is validated as given, then trimmed of leading/trailing chars <= ' ' before matching.
        val trimmedPrefix = keyPrefix.trim { it <= ' ' }
        return Collector.collect(Evaluator.AttributeStarting(trimmedPrefix), this)
    }

    /**
     * Find elements that have an attribute with the specific value. Case-insensitive.
     *
     * @param key name of the attribute
     * @param value value of the attribute
     * @return elements that have this attribute with this value, empty if none
     */
    fun getElementsByAttributeValue(key: String, value: String): Elements =
        Collector.collect(Evaluator.AttributeWithValue(key, value), this)

    /**
     * Find elements that either do not have this attribute, or have it with a different value. Case-insensitive.
*
     * @param key name of the attribute
     * @param value value of the attribute
     * @return elements that do not have a matching attribute
     */
    fun getElementsByAttributeValueNot(key: String, value: String): Elements =
        Collector.collect(Evaluator.AttributeWithValueNot(key, value), this)

    /**
     * Find elements that have attributes that start with the value prefix. Case-insensitive.
     *
     * @param key name of the attribute
     * @param valuePrefix start of attribute value
     * @return elements that have attributes that start with the value prefix
     */
    fun getElementsByAttributeValueStarting(key: String, valuePrefix: String): Elements =
        Collector.collect(Evaluator.AttributeWithValueStarting(key, valuePrefix), this)

    /**
     * Find elements that have attributes that end with the value suffix. Case-insensitive.
     *
     * @param key name of the attribute
     * @param valueSuffix end of the attribute value
     * @return elements that have attributes that end with the value suffix
     */
    fun getElementsByAttributeValueEnding(key: String, valueSuffix: String): Elements =
        Collector.collect(Evaluator.AttributeWithValueEnding(key, valueSuffix), this)

    /**
     * Find elements that have attributes whose value contains the match string. Case-insensitive.
     *
     * @param key name of the attribute
     * @param match substring of value to search for
     * @return elements that have attributes containing this text
     */
    fun getElementsByAttributeValueContaining(key: String, match: String): Elements =
        Collector.collect(Evaluator.AttributeWithValueContaining(key, match), this)

    /**
     * Find elements that have an attribute whose value matches the supplied regular expression.
* @param key name of the attribute
     * @param regex compiled regular expression to match against attribute values
     * @return elements that have attributes matching this regular expression
     */
    fun getElementsByAttributeValueMatching(key: String, regex: Regex): Elements =
        Collector.collect(Evaluator.AttributeWithValueMatching(key, regex), this)

    /**
     * Find elements that have attributes whose values match the supplied regular expression.
     * @param key name of the attribute
     * @param regex regular expression to match against attribute values. You can use [embedded flags](http://java.sun.com/docs/books/tutorial/essential/regex/pattern.html#embedded) (such as (?i) and (?m)) to control regex options.
     * @return elements that have attributes matching this regular expression
     * @throws IllegalArgumentException if compiling [regex] raises a `PatternSyntaxException`
     */
    fun getElementsByAttributeValueMatching(key: String, regex: String): Elements {
        // Compile first, so a syntax error is reported with the offending pattern text.
        val compiled: Regex =
            try {
                Regex(regex)
            } catch (cause: PatternSyntaxException) {
                throw IllegalArgumentException("Pattern syntax error: $regex", cause)
            }
        return getElementsByAttributeValueMatching(key, compiled)
    }

    /**
     * Find elements whose sibling index is less than the supplied index.
     * @param index 0-based index
     * @return elements less than index
     */
    fun getElementsByIndexLessThan(index: Int): Elements =
        Collector.collect(Evaluator.IndexLessThan(index), this)

    /**
     * Find elements whose sibling index is greater than the supplied index.
     * @param index 0-based index
     * @return elements greater than index
     */
    fun getElementsByIndexGreaterThan(index: Int): Elements =
        Collector.collect(Evaluator.IndexGreaterThan(index), this)

    /**
     * Find elements whose sibling index is equal to the supplied index.
* @param index 0-based index
     * @return elements equal to index
     */
    fun getElementsByIndexEquals(index: Int): Elements =
        Collector.collect(Evaluator.IndexEquals(index), this)

    /**
     * Find elements that contain the specified string. The search is case-insensitive. The text may appear directly
     * in the element, or in any of its descendants.
     * @param searchText to look for in the element's text
     * @return elements that contain the string, case-insensitive.
     * @see Element.text
     */
    fun getElementsContainingText(searchText: String): Elements =
        Collector.collect(Evaluator.ContainsText(searchText), this)

    /**
     * Find elements that directly contain the specified string. The search is case-insensitive. The text must appear directly
     * in the element, not in any of its descendants.
     * @param searchText to look for in the element's own text
     * @return elements that contain the string, case-insensitive.
     * @see Element.ownText
     */
    fun getElementsContainingOwnText(searchText: String): Elements =
        Collector.collect(Evaluator.ContainsOwnText(searchText), this)

    /**
     * Find elements whose text matches the supplied regular expression.
     * @param regex regular expression to match text against
     * @return elements matching the supplied regular expression.
     * @see Element.text
     */
    fun getElementsMatchingText(regex: Regex): Elements =
        Collector.collect(Evaluator.Matches(regex), this)

    /**
     * Find elements whose text matches the supplied regular expression.
     * @param regex regular expression to match text against. You can use [embedded flags](http://java.sun.com/docs/books/tutorial/essential/regex/pattern.html#embedded) (such as (?i) and (?m)) to control regex options.
     * @return elements matching the supplied regular expression.
* @see Element.text
     */
    fun getElementsMatchingText(regex: String): Elements {
        // Compile first, so a syntax error is reported with the offending pattern text.
        val compiled: Regex =
            try {
                Regex(regex)
            } catch (cause: PatternSyntaxException) {
                throw IllegalArgumentException("Pattern syntax error: $regex", cause)
            }
        return getElementsMatchingText(compiled)
    }

    /**
     * Find elements whose own text matches the supplied regular expression.
     * @param regex regular expression to match text against
     * @return elements matching the supplied regular expression.
     * @see Element.ownText
     */
    fun getElementsMatchingOwnText(regex: Regex): Elements =
        Collector.collect(Evaluator.MatchesOwn(regex), this)

    /**
     * Find elements whose own text matches the supplied regular expression.
     * @param regex regular expression to match text against. You can use [embedded flags](http://java.sun.com/docs/books/tutorial/essential/regex/pattern.html#embedded) (such as (?i) and (?m)) to control regex options.
     * @return elements matching the supplied regular expression.
     * @see Element.ownText
     */
    fun getElementsMatchingOwnText(regex: String): Elements {
        // Compile first, so a syntax error is reported with the offending pattern text.
        val compiled: Regex =
            try {
                Regex(regex)
            } catch (cause: PatternSyntaxException) {
                throw IllegalArgumentException("Pattern syntax error: $regex", cause)
            }
        return getElementsMatchingOwnText(compiled)
    }

    /**
     * Collects the elements matched by `Evaluator.AllElements`, with this element as the root of the search.
     * @return the collected elements
     */
    fun getAllElements(): Elements {
        return Collector.collect(Evaluator.AllElements(), this)
    }

    /**
     * Gets the **normalized, combined text** of this element and all its children. Whitespace is normalized and
     * trimmed.
     *
     * For example, given HTML `

Hello there now!

`, `p.text()` returns `"Hello there + * now!"` + * + * If you do not want normalized text, use [.wholeText]. If you want just the text of this node (and not + * children), use [.ownText] + * + * Note that this method returns the textual content that would be presented to a reader. The contents of data + * nodes (such as ` Two" + val doc: Document = + DataUtil.parseInputSource( + this.stream(html), + "UTF-8", + "http://foo.com/", + Parser.htmlParser() + ) + assertEquals("One", doc.head().text()) + } + + @Test + fun discardsSpuriousByteOrderMarkWhenNoCharsetSet() { + val html = "\uFEFFOne Two" + val doc: Document = + DataUtil.parseInputSource( + this.stream(html), + null, + "http://foo.com/", + Parser.htmlParser() + ) + assertEquals("One", doc.head().text()) + assertEquals("UTF-8", doc.outputSettings().charset().name.uppercase()) + } + + @Test + fun shouldNotThrowExceptionOnEmptyCharset() { + assertNull(DataUtil.getCharsetFromContentType("text/html; charset=")) + assertNull(DataUtil.getCharsetFromContentType("text/html; charset=;")) + } + + @Test + fun shouldSelectFirstCharsetOnWeirdMultileCharsetsInMetaTags() { + assertEquals( + "ISO-8859-1", + DataUtil.getCharsetFromContentType("text/html; charset=ISO-8859-1, charset=1251") + ) + } + + @Test + fun shouldCorrectCharsetForDuplicateCharsetString() { + assertEquals( + "iso-8859-1", + DataUtil.getCharsetFromContentType("text/html; charset=charset=iso-8859-1") + ) + } + + @Test + fun shouldReturnNullForIllegalCharsetNames() { + assertNull(DataUtil.getCharsetFromContentType("text/html; charset=\$HJKDF§$/(")) + } + + @Test + fun generatesMimeBoundaries() { + val m1 = DataUtil.mimeBoundary() + val m2 = DataUtil.mimeBoundary() + assertEquals(DataUtil.boundaryLength, m1.length) + assertEquals(DataUtil.boundaryLength, m2.length) + assertNotSame(m1, m2) + } + + @Test + fun wrongMetaCharsetFallback() { + val html = " " + val doc: Document = + DataUtil.parseInputSource( + this.stream(html), + null, + "http://example.com", + 
Parser.htmlParser() + ) + val expected = """ + + + + +""" + assertEquals(expected, doc.toString()) + } + + @Test + @Throws(Exception::class) + fun secondMetaElementWithContentTypeContainsCharsetParameter() { + val html = "" + + "" + + "" + + " 한국어" + val doc: Document = + DataUtil.parseInputSource( + stream(html, "euc-kr"), + null, + "http://example.com", + Parser.htmlParser() + ) + assertEquals("한국어", doc.body().text()) + } + + @Test + @Throws(Exception::class) + fun firstMetaElementWithCharsetShouldBeUsedForDecoding() { + val html = "" + + "" + + "" + + " Übergrößenträger" + val doc: Document = + DataUtil.parseInputSource( + stream(html, "iso-8859-1"), + null, + "http://example.com", + Parser.htmlParser() + ) + assertEquals("Übergrößenträger", doc.body().text()) + } + + @Test + fun supportsBOMinFiles() { + // test files from http://www.i18nl10n.com/korean/utftest/ + var `in` = ParseTest.getResourceAbsolutePath("bomtests/bom_utf16be.html") + var doc: Document = + Ksoup.parseFile(file = `in`, baseUri = "http://example.com", charsetName = null) + assertTrue(doc.title().contains("UTF-16BE")) + assertTrue(doc.text().contains("가각갂갃간갅")) + `in` = ParseTest.getResourceAbsolutePath("bomtests/bom_utf16le.html") + doc = Ksoup.parseFile(file = `in`, baseUri = "http://example.com", charsetName = null) + assertTrue(doc.title().contains("UTF-16LE")) + assertTrue(doc.text().contains("가각갂갃간갅")) + `in` = ParseTest.getResourceAbsolutePath("bomtests/bom_utf32be.html") + doc = Ksoup.parseFile(file = `in`, baseUri = "http://example.com", charsetName = null) + assertTrue(doc.title().contains("UTF-32BE")) + assertTrue(doc.text().contains("가각갂갃간갅")) + `in` = ParseTest.getResourceAbsolutePath("bomtests/bom_utf32le.html") + doc = Ksoup.parseFile(file = `in`, baseUri = "http://example.com", charsetName = null) + assertTrue(doc.title().contains("UTF-32LE")) + assertTrue(doc.text().contains("가각갂갃간갅")) + } + + @Test + fun supportsUTF8BOM() { + val `in`: String = 
ParseTest.getResourceAbsolutePath("bomtests/bom_utf8.html") + val doc: Document = Ksoup.parseFile(`in`, "http://example.com", null) + assertEquals("OK", doc.head().select("title").text()) + } + + @Test + fun noExtraNULLBytes() { + val b = + "
üü
".toByteArray( + Charsets.UTF_8 + ) + val doc = Ksoup.parse(BufferReader(b), null, "") + assertFalse(doc.outerHtml().contains("\u0000")) + } + + @Test + fun supportsZippedUTF8BOM() { + val `in`: String = ParseTest.getResourceAbsolutePath("bomtests/bom_utf8.html.gz") + val doc: Document = Ksoup.parseFile( + file = `in`, + baseUri = "http://example.com", + charsetName = null + ) + assertEquals("OK", doc.head().select("title").text()) + assertEquals( + "There is a UTF8 BOM at the top (before the XML decl). If not read correctly, will look like a non-joining space.", + doc.body().text() + ) + } + + @Test + fun supportsXmlCharsetDeclaration() { + val encoding = "iso-8859-1" + val soup = BufferReader( + ("" + + "" + + "Hellö Wörld!").toByteArray( + Charset.forName(encoding) + ) + ) + val doc: Document = Ksoup.parse(soup, null, "") + assertEquals("Hellö Wörld!", doc.body().text()) + } + + @Test + fun lLoadsGzipFile() { + val `in`: String = ParseTest.getResourceAbsolutePath("htmltests/gzip.html.gz") + val doc: Document = Ksoup.parseFile(`in`, null) + doc.toString() + assertEquals("Gzip test", doc.title()) + assertEquals("This is a gzipped HTML file.", doc.selectFirst("p")!!.text()) + } + + @Test + fun loadsZGzipFile() { + // compressed on win, with z suffix + val `in`: String = ParseTest.getResourceAbsolutePath("htmltests/gzip.html.z") + val doc: Document = Ksoup.parseFile(`in`, null) + assertEquals("Gzip test", doc.title()) + assertEquals("This is a gzipped HTML file.", doc.selectFirst("p")!!.text()) + } + + @Test + fun handlesFakeGzipFile() { + val `in`: String = ParseTest.getResourceAbsolutePath("htmltests/fake-gzip.html.gz") + val doc: Document = Ksoup.parseFile(`in`, null) + assertEquals("This is not gzipped", doc.title()) + assertEquals("And should still be readable.", doc.selectFirst("p")!!.text()) + } + + @Test + fun handlesChunkedInputStream() { + val inputFile: String = ParseTest.getResourceAbsolutePath("htmltests/large.html") + val input: String = 
ParseTest.getFileAsString(inputFile.toPath()) +// val stream = VaryingBufferReader(BufferReader(input)) + val expected = Ksoup.parse(input, "https://example.com") + val doc: Document = Ksoup.parse(BufferReader(input), null, "https://example.com") + + println("""docSize: ${doc.toString().length}, expectedSize: ${expected.toString().length}""") + assertTrue(doc.hasSameValue(expected)) + } + + @Test + fun handlesUnlimitedRead() { + val inputFile: String = ParseTest.getResourceAbsolutePath("htmltests/large.html") + val input: String = ParseTest.getFileAsString(inputFile.toPath()) +// val stream = VaryingBufferReader(BufferReader(input)) +// val byteBuffer: BufferReader = DataUtil.readToByteBuffer(stream, 0) + val byteBuffer: BufferReader = DataUtil.readToByteBuffer(BufferReader(input), 0) + val read = byteBuffer.readByteArray().decodeToString() + assertEquals(input, read) + } +} diff --git a/ksoup/src/commonTest/kotlin/com/fleeksoft/ksoup/integration/ParseTest.kt b/ksoup/src/commonTest/kotlin/com/fleeksoft/ksoup/integration/ParseTest.kt new file mode 100644 index 00000000..040590e0 --- /dev/null +++ b/ksoup/src/commonTest/kotlin/com/fleeksoft/ksoup/integration/ParseTest.kt @@ -0,0 +1,184 @@ +package com.fleeksoft.ksoup.integration + +import com.fleeksoft.ksoup.Ksoup.parse +import com.fleeksoft.ksoup.Ksoup.parseFile +import com.fleeksoft.ksoup.nodes.Document +import com.fleeksoft.ksoup.parser.Parser +import com.fleeksoft.ksoup.ported.BufferReader +import com.fleeksoft.ksoup.readFile +import com.fleeksoft.ksoup.readGzipFile +import io.ktor.utils.io.charsets.name +import okio.Path +import okio.Path.Companion.toPath +import kotlin.test.Test +import kotlin.test.assertEquals +import kotlin.test.assertNotEquals +import kotlin.test.assertTrue + +/** + * Integration test: parses from real-world example HTML. 
+ * + * @author Sabeeh, fleeksoft@gmail.com + */ +class ParseTest { + @Test + fun testHtml5Charset() { + // test that works + var `in` = getResourceAbsolutePath("htmltests/meta-charset-1.html") + var doc: Document = + parseFile( + file = `in`, + baseUri = "http://example.com/", + charsetName = null + ) //gb2312, has html5 + assertEquals("新", doc.text()) + assertEquals("GB2312", doc.outputSettings().charset().name.uppercase()) + + // double check, no charset, falls back to utf8 which is incorrect + `in` = getResourceAbsolutePath("htmltests/meta-charset-2.html") // + doc = parseFile( + file = `in`, + baseUri = "http://example.com", + charsetName = null + ) // gb2312, no charset + assertEquals("UTF-8", doc.outputSettings().charset().name.uppercase()) + assertNotEquals("新", doc.text()) + + // confirm fallback to utf8 + `in` = getResourceAbsolutePath("htmltests/meta-charset-3.html") + doc = parseFile( + file = `in`, + baseUri = "http://example.com/", + charsetName = null + ) // utf8, no charset + assertEquals("UTF-8", doc.outputSettings().charset().name.uppercase()) + assertEquals("新", doc.text()) + } + + @Test + fun testBrokenHtml5CharsetWithASingleDoubleQuote() { + val `in` = BufferReader( + """ + + + + + """.trimIndent() + ) + + val doc: Document = parse(`in`, null, "http://example.com/") + assertEquals("UTF-8", doc.outputSettings().charset().name.uppercase()) + } + + @Test + fun testLowercaseUtf8Charset() { + val `in` = getResourceAbsolutePath("htmltests/lowercase-charset-test.html") + val doc: Document = parseFile(`in`, null) + val form = doc.select("#form").first() + assertEquals(2, form!!.children().size) + assertEquals("UTF-8", doc.outputSettings().charset().name.uppercase()) + } + + @Test + fun testXwiki() { + // https://github.com/jhy/jsoup/issues/1324 + // this tests that when in CharacterReader we hit a buffer while marked, we preserve the mark when buffered up and can rewind + val `in` = getResourceAbsolutePath("htmltests/xwiki-1324.html.gz") + val doc: 
Document = parseFile( + file = `in`, + baseUri = "https://localhost/", + charsetName = null + ) + assertEquals("XWiki Jetty HSQLDB 12.1-SNAPSHOT", doc.select("#xwikiplatformversion").text()) + + // was getting busted at =userdirectory, because it hit the bufferup point but the mark was then lost. so + // updated to preserve the mark. + val wantHtml = + "User Directory" + assertEquals(wantHtml, doc.select("[data-id=userdirectory]").outerHtml()) + } + + @Test + fun testXwikiExpanded() { + // https://github.com/jhy/jsoup/issues/1324 + // this tests that if there is a huge illegal character reference, we can get through a buffer and rewind, and still catch that it's an invalid refence, + // and the parse tree is correct. + + val parser = Parser.htmlParser() + val doc = parse( + resourceFilePathToBufferReader("htmltests/xwiki-edit.html.gz"), + "UTF-8", + "https://localhost/", + parser.setTrackErrors(100) + ) + val errors = parser.getErrors() + assertEquals("XWiki Jetty HSQLDB 12.1-SNAPSHOT", doc.select("#xwikiplatformversion").text()) + assertEquals(0, errors.size) // not an invalid reference because did not look legit + + // was getting busted at =userdirectory, because it hit the bufferup point but the mark was then lost. so + // updated to preserve the mark. 
+ val wantHtml = + "User Directory" + assertEquals(wantHtml, doc.select("[data-id=userdirectory]").outerHtml()) + } + + @Test + fun testFileParseNoCharsetMethod() { + val file = getResourceAbsolutePath("htmltests/xwiki-1324.html.gz") + val doc: Document = parseFile(file) + assertEquals("XWiki Jetty HSQLDB 12.1-SNAPSHOT", doc.select("#xwikiplatformversion").text()) + } + + companion object { + + fun getResourceAbsolutePath(resourceName: String): String { +// /Users/sabeeh/AndroidStudioProjects/ksoup/ksoup/src/commonTest/resources/ +// return "src/commonTest/resources/$resourceName".toPath() + return "/Users/sabeeh/AndroidStudioProjects/ksoup/ksoup/src/commonTest/resources/$resourceName" + } + + fun getFileAsString(file: Path): String { + val bytes: ByteArray = if (file.name.endsWith(".gz")) { + readGzipFile(file).readByteArray() + } else { + readFile(file).readByteArray() + } + return bytes.decodeToString() + } + + fun resourceFilePathToBufferReader(path: String): BufferReader { + val file = this.getResourceAbsolutePath(path) + return pathToBufferReader(file.toPath()) + } + + fun pathToBufferReader(file: Path): BufferReader { + return if (file.name.endsWith(".gz")) { + BufferReader(readGzipFile(file).readByteArray()) + } else { + BufferReader(readFile(file).readByteArray()) + } + } + } +} diff --git a/ksoup/src/commonTest/kotlin/com/fleeksoft/ksoup/integration/SafelistExtensionTest.kt b/ksoup/src/commonTest/kotlin/com/fleeksoft/ksoup/integration/SafelistExtensionTest.kt new file mode 100644 index 00000000..300a7734 --- /dev/null +++ b/ksoup/src/commonTest/kotlin/com/fleeksoft/ksoup/integration/SafelistExtensionTest.kt @@ -0,0 +1,35 @@ +package com.fleeksoft.ksoup.integration + +import com.fleeksoft.ksoup.Ksoup +import com.fleeksoft.ksoup.nodes.Attribute +import com.fleeksoft.ksoup.nodes.Element +import com.fleeksoft.ksoup.safety.Safelist +import kotlin.test.Test +import kotlin.test.assertEquals + +/** + * Check that we can extend Safelist methods + */ +class 
SafelistExtensionTest { + @Test + fun canCustomizeSafeTests() { + val openSafelist = OpenSafelist(Safelist.relaxed()) + val safelist = Safelist.relaxed() + val html = "

Hello

" + val openClean = Ksoup.clean(html, openSafelist) + val clean = Ksoup.clean(html, safelist) + assertEquals("

Hello

", com.fleeksoft.ksoup.TextUtil.stripNewlines(openClean)) + assertEquals("

Hello

", clean) + } + + // passes tags and attributes starting with "open" + private class OpenSafelist(safelist: Safelist?) : Safelist(safelist!!) { + override fun isSafeAttribute(tagName: String, el: Element, attr: Attribute): Boolean { + return if (attr.key.startsWith("open")) true else super.isSafeAttribute(tagName, el, attr) + } + + override fun isSafeTag(tag: String): Boolean { + return if (tag.startsWith("open")) true else super.isSafeTag(tag) + } + } +} diff --git a/ksoup/src/commonTest/kotlin/com/fleeksoft/ksoup/internal/StringUtilTest.kt b/ksoup/src/commonTest/kotlin/com/fleeksoft/ksoup/internal/StringUtilTest.kt new file mode 100644 index 00000000..6c21b4c2 --- /dev/null +++ b/ksoup/src/commonTest/kotlin/com/fleeksoft/ksoup/internal/StringUtilTest.kt @@ -0,0 +1,185 @@ +package com.fleeksoft.ksoup.internal + +import com.fleeksoft.ksoup.Ksoup.parse +import com.fleeksoft.ksoup.internal.StringUtil.isAscii +import com.fleeksoft.ksoup.internal.StringUtil.isBlank +import com.fleeksoft.ksoup.internal.StringUtil.isNumeric +import com.fleeksoft.ksoup.internal.StringUtil.isWhitespace +import com.fleeksoft.ksoup.internal.StringUtil.join +import com.fleeksoft.ksoup.internal.StringUtil.normaliseWhitespace +import com.fleeksoft.ksoup.internal.StringUtil.padding +import kotlin.test.assertEquals +import kotlin.test.Test +import kotlin.test.assertFalse +import kotlin.test.assertTrue + +class StringUtilTest { + @Test + fun join() { + assertEquals("", join(listOf(""), " ")) + assertEquals("one", join(listOf("one"), " ")) + assertEquals("one two three", join(mutableListOf("one", "two", "three"), " ")) + } + + @Test + fun padding() { + assertEquals("", padding(0)) + assertEquals(" ", padding(1)) + assertEquals(" ", padding(2)) + assertEquals(" ", padding(15)) + assertEquals(" ", padding(45)) // we default to tap out at 30 + + // memoization is up to 21 blocks (0 to 20 spaces) and exits early before min checks making maxPaddingWidth unused + assertEquals("", padding(0, -1)) + 
assertEquals(" ", padding(20, -1)) + + // this test escapes memoization and continues through + assertEquals(" ", padding(21, -1)) + + // this test escapes memoization and using unlimited length (-1) will allow requested spaces + assertEquals(" ", padding(30, -1)) + assertEquals(" ", padding(45, -1)) + + // we tap out at 0 for this test + assertEquals("", padding(0, 0)) + + // as memoization is escaped, setting zero for max padding will not allow any requested width + assertEquals("", padding(21, 0)) + + // we tap out at 30 for these tests making > 30 use 30 + assertEquals("", padding(0, 30)) + assertEquals(" ", padding(1, 30)) + assertEquals(" ", padding(2, 30)) + assertEquals(" ", padding(15, 30)) + assertEquals(" ", padding(45, 30)) + + // max applies regardless of memoized + assertEquals(5, padding(20, 5).length) + } + + @Test + fun paddingInACan() { + val padding = padding + assertEquals(21, padding.size) + for (i in padding.indices) { + assertEquals(i, padding[i].length) + } + } + + @Test + fun isBlank() { + assertTrue(isBlank(null)) + assertTrue(isBlank("")) + assertTrue(isBlank(" ")) + assertTrue(isBlank(" \r\n ")) + assertFalse(isBlank("hello")) + assertFalse(isBlank(" hello ")) + } + + @Test + fun isNumeric() { + assertFalse(isNumeric(null)) + assertFalse(isNumeric(" ")) + assertFalse(isNumeric("123 546")) + assertFalse(isNumeric("hello")) + assertFalse(isNumeric("123.334")) + assertTrue(isNumeric("1")) + assertTrue(isNumeric("1234")) + } + + @Test + fun isWhitespace() { + assertTrue(isWhitespace('\t'.code)) + assertTrue(isWhitespace('\n'.code)) + assertTrue(isWhitespace('\r'.code)) + assertTrue(isWhitespace('\u000c'.code)) + assertTrue(isWhitespace(' '.code)) + assertFalse(isWhitespace('\u00a0'.code)) + assertFalse(isWhitespace('\u2000'.code)) + assertFalse(isWhitespace('\u3000'.code)) + } + + @Test + fun normaliseWhiteSpace() { + assertEquals(" ", normaliseWhitespace(" \r \n \r\n")) + assertEquals(" hello there ", normaliseWhitespace(" hello \r \n there 
\n")) + assertEquals("hello", normaliseWhitespace("hello")) + assertEquals("hello there", normaliseWhitespace("hello\nthere")) + } + + @Test + fun normaliseWhiteSpaceHandlesHighSurrogates() { + val test71540chars = "\ud869\udeb2\u304b\u309a 1" + val test71540charsExpectedSingleWhitespace = "\ud869\udeb2\u304b\u309a 1" + assertEquals(test71540charsExpectedSingleWhitespace, normaliseWhitespace(test71540chars)) + val extractedText = parse(test71540chars).text() + assertEquals(test71540charsExpectedSingleWhitespace, extractedText) + } + + @Test + fun resolvesRelativeUrls() { + assertEquals("http://example.com/one/two?three", StringUtil.resolve("http://example.com", "./one/two?three")) + assertEquals( + "http://example.com/one/two?three", + StringUtil.resolve("http://example.com?one", "./one/two?three") + ) + assertEquals( + "http://example.com/one/two?three#four", + StringUtil.resolve("http://example.com", "./one/two?three#four") + ) + assertEquals("https://example.com/one", StringUtil.resolve("http://example.com/", "https://example.com/one")) + assertEquals( + "http://example.com/one/two.html", + StringUtil.resolve("http://example.com/two/", "../one/two.html") + ) + assertEquals("https://example2.com/one", StringUtil.resolve("https://example.com/", "//example2.com/one")) + assertEquals("https://example.com:8080/one", StringUtil.resolve("https://example.com:8080", "./one")) + assertEquals("https://example2.com/one", StringUtil.resolve("http://example.com/", "https://example2.com/one")) + assertEquals("https://example.com/one", StringUtil.resolve("wrong", "https://example.com/one")) + assertEquals("https://example.com/one", StringUtil.resolve("https://example.com/one", "")) + assertEquals("", StringUtil.resolve("wrong", "also wrong")) + assertEquals("ftp://example.com/one", StringUtil.resolve("ftp://example.com/two/", "../one")) + assertEquals("ftp://example.com/one/two.c", StringUtil.resolve("ftp://example.com/one/", "./two.c")) + 
assertEquals("ftp://example.com/one/two.c", StringUtil.resolve("ftp://example.com/one/", "two.c")) + // examples taken from rfc3986 section 5.4.2 + assertEquals("http://example.com/g", StringUtil.resolve("http://example.com/b/c/d;p?q", "../../../g")) + assertEquals("http://example.com/g", StringUtil.resolve("http://example.com/b/c/d;p?q", "../../../../g")) + assertEquals("http://example.com/g", StringUtil.resolve("http://example.com/b/c/d;p?q", "/./g")) + assertEquals("http://example.com/g", StringUtil.resolve("http://example.com/b/c/d;p?q", "/../g")) + assertEquals("http://example.com/b/c/g.", StringUtil.resolve("http://example.com/b/c/d;p?q", "g.")) + assertEquals("http://example.com/b/c/.g", StringUtil.resolve("http://example.com/b/c/d;p?q", ".g")) + assertEquals("http://example.com/b/c/g..", StringUtil.resolve("http://example.com/b/c/d;p?q", "g..")) + assertEquals("http://example.com/b/c/..g", StringUtil.resolve("http://example.com/b/c/d;p?q", "..g")) + assertEquals("http://example.com/b/g", StringUtil.resolve("http://example.com/b/c/d;p?q", "./../g")) + assertEquals("http://example.com/b/c/g/", StringUtil.resolve("http://example.com/b/c/d;p?q", "./g/.")) + assertEquals("http://example.com/b/c/g/h", StringUtil.resolve("http://example.com/b/c/d;p?q", "g/./h")) + assertEquals("http://example.com/b/c/h", StringUtil.resolve("http://example.com/b/c/d;p?q", "g/../h")) + assertEquals("http://example.com/b/c/g;x=1/y", StringUtil.resolve("http://example.com/b/c/d;p?q", "g;x=1/./y")) + assertEquals("http://example.com/b/c/y", StringUtil.resolve("http://example.com/b/c/d;p?q", "g;x=1/../y")) + assertEquals("http://example.com/b/c/g?y/./x", StringUtil.resolve("http://example.com/b/c/d;p?q", "g?y/./x")) + assertEquals("http://example.com/b/c/g?y/../x", StringUtil.resolve("http://example.com/b/c/d;p?q", "g?y/../x")) + assertEquals("http://example.com/b/c/g#s/./x", StringUtil.resolve("http://example.com/b/c/d;p?q", "g#s/./x")) + assertEquals("http://example.com/b/c/g#s/../x", 
StringUtil.resolve("http://example.com/b/c/d;p?q", "g#s/../x")) + } + + @Test + fun stripsControlCharsFromUrls() { + // in java URL return exception when URL(URL(https://example.com), "foo:bar) + assertEquals("https://example.com/foo:bar", StringUtil.resolve("\nhttps://\texample.com/", "\r\nfo\to:ba\br")) + } + + @Test + fun allowsSpaceInUrl() { + assertEquals("https://example.com/foo bar/", StringUtil.resolve("HTTPS://example.com/example/", "../foo bar/")) + } + + @Test + fun isAscii() { + assertTrue(isAscii("")) + assertTrue(isAscii("example.com")) + assertTrue(isAscii("One Two")) + assertFalse(isAscii("🧔")) + assertFalse(isAscii("测试")) + assertFalse(isAscii("测试.com")) + } +} diff --git a/ksoup/src/commonTest/kotlin/com/fleeksoft/ksoup/nodes/AttributeTest.kt b/ksoup/src/commonTest/kotlin/com/fleeksoft/ksoup/nodes/AttributeTest.kt new file mode 100644 index 00000000..16c02518 --- /dev/null +++ b/ksoup/src/commonTest/kotlin/com/fleeksoft/ksoup/nodes/AttributeTest.kt @@ -0,0 +1,94 @@ +package com.fleeksoft.ksoup.nodes + +import com.fleeksoft.ksoup.Ksoup.parse +import com.fleeksoft.ksoup.parser.ParseSettings +import com.fleeksoft.ksoup.parser.Parser +import de.cketti.codepoints.deluxe.toCodePoint +import kotlin.test.* + +class AttributeTest { + @Test + fun html() { + val attr = Attribute("key", "value &") + assertEquals("key=\"value &\"", attr.html()) + assertEquals(attr.html(), attr.toString()) + } + + @Test + fun testWithSupplementaryCharacterInAttributeKeyAndValue() { + val s = 135361.toCodePoint().toChars().concatToString() + val attr = Attribute(s, "A" + s + "B") + assertEquals(s + "=\"A" + s + "B\"", attr.html()) + assertEquals(attr.html(), attr.toString()) + } + + @Test + fun validatesKeysNotEmpty() { + assertFailsWith { Attribute(" ", "Check") } + } + + @Test + fun validatesKeysNotEmptyViaSet() { + assertFailsWith { + val attr = Attribute("One", "Check") + attr.setKey(" ") + } + } + + @Test + fun booleanAttributesAreEmptyStringValues() { + val doc = parse("