Skip to content

Commit 4ea768d

Browse files
committed
Strip control characters from URLs when resolving absolute URLs
1 parent 985f1fe commit 4ea768d

File tree

3 files changed

+36
-1
lines changed

3 files changed

+36
-1
lines changed

src/main/java/org/jsoup/internal/StringUtil.java

+9-1
Original file line numberDiff line numberDiff line change
@@ -290,6 +290,7 @@ public static boolean isAscii(String string) {
290290
* @throws MalformedURLException if an error occurred generating the URL
291291
*/
292292
public static URL resolve(URL base, String relUrl) throws MalformedURLException {
293+
relUrl = stripControlChars(relUrl);
293294
// workaround: java resolves '//path/file + ?foo' to '//path/?foo', not '//path/file?foo' as desired
294295
if (relUrl.startsWith("?"))
295296
relUrl = base.getPath() + relUrl;
@@ -308,7 +309,9 @@ public static URL resolve(URL base, String relUrl) throws MalformedURLException
308309
* @param relUrl the relative URL to resolve. (If it's already absolute, it will be returned)
309310
* @return an absolute URL if one was able to be generated, or the empty string if not
310311
*/
311-
public static String resolve(final String baseUrl, final String relUrl) {
312+
public static String resolve(String baseUrl, String relUrl) {
313+
// workaround: java will allow control chars in a path URL and may treat as relative, but Chrome / Firefox will strip and may see as a scheme. Normalize to browser's view.
314+
baseUrl = stripControlChars(baseUrl); relUrl = stripControlChars(relUrl);
312315
try {
313316
URL base;
314317
try {
@@ -327,6 +330,11 @@ public static String resolve(final String baseUrl, final String relUrl) {
327330
}
328331
// Matches a leading URI scheme per RFC 3986: ALPHA *( ALPHA / DIGIT / "+" / "-" / "." ) followed by ':'.
// The '-' is placed last in the character class so it is a literal; written as "+-." it forms a range that also admits ','.
private static final Pattern validUriScheme = Pattern.compile("^[a-zA-Z][a-zA-Z0-9+.-]*:");
329332

333+
// Matches runs of ASCII control characters (0 - 31), which are stripped from URLs before resolving.
// Uses '+' rather than '*' so the pattern never matches the empty string at every position of the input.
private static final Pattern controlChars = Pattern.compile("[\\x00-\\x1f]+");
334+
private static String stripControlChars(final String input) {
335+
return controlChars.matcher(input).replaceAll("");
336+
}
337+
330338
private static final ThreadLocal<Stack<StringBuilder>> threadLocalBuilders = new ThreadLocal<Stack<StringBuilder>>() {
331339
@Override
332340
protected Stack<StringBuilder> initialValue() {

src/test/java/org/jsoup/internal/StringUtilTest.java

+9
Original file line numberDiff line numberDiff line change
@@ -147,6 +147,15 @@ public void join() {
147147
assertEquals("http://example.com/b/c/g#s/../x", resolve("http://example.com/b/c/d;p?q", "g#s/../x"));
148148
}
149149

150+
@Test void stripsControlCharsFromUrls() {
151+
// should resolve to an absolute url:
152+
assertEquals("foo:bar", resolve("\nhttps://\texample.com/", "\r\nfo\to:ba\br"));
153+
}
154+
155+
@Test void allowsSpaceInUrl() {
156+
assertEquals("https://example.com/foo bar/", resolve("HTTPS://example.com/example/", "../foo bar/"));
157+
}
158+
150159
@Test
151160
void isAscii() {
152161
assertTrue(StringUtil.isAscii(""));

src/test/java/org/jsoup/safety/CleanerTest.java

+18
Original file line numberDiff line numberDiff line change
@@ -213,6 +213,24 @@ public void safeListedProtocolShouldBeRetained(Locale locale) {
213213
assertEquals("<a rel=\"nofollow\">Link</a>", clean);
214214
}
215215

216+
@Test void dropsConcealedJavascriptProtocolWhenRelativesLinksEnabled() {
217+
Safelist safelist = Safelist.basic().preserveRelativeLinks(true);
218+
String html = "<a href=\"&#0013;ja&Tab;va&Tab;script&#0010;:alert(1)\">Link</a>";
219+
String clean = Jsoup.clean(html, "https://", safelist);
220+
assertEquals("<a rel=\"nofollow\">Link</a>", clean);
221+
222+
String colon = "<a href=\"ja&Tab;va&Tab;script&colon;alert(1)\">Link</a>";
223+
String cleanColon = Jsoup.clean(colon, "https://", safelist);
224+
assertEquals("<a rel=\"nofollow\">Link</a>", cleanColon);
225+
}
226+
227+
@Test void dropsConcealedJavascriptProtocolWhenRelativesLinksDisabled() {
228+
Safelist safelist = Safelist.basic().preserveRelativeLinks(false);
229+
String html = "<a href=\"ja&Tab;vas&#0013;cript:alert(1)\">Link</a>";
230+
String clean = Jsoup.clean(html, "https://", safelist);
231+
assertEquals("<a rel=\"nofollow\">Link</a>", clean);
232+
}
233+
216234
@Test public void handlesCustomProtocols() {
217235
String html = "<img src='cid:12345' /> <img src='data:gzzt' />";
218236
String dropped = Jsoup.clean(html, Safelist.basicWithImages());

0 commit comments

Comments
 (0)