Skip to content

Commit aac7441

Browse files
committed
ExtractorHTML: Fix srcset by normalizing elementContext() to lowercase
Lowercasing the context in elementContext() means the later comparison in processEmbed() matches regardless of how the attribute name was cased in the HTML, so variants like srcSet or SRCSET no longer need special handling. Note that HTMLLinkContext.get() already lowercases the context in some cases. Fixes internetarchive#477.
1 parent 55078c0 commit aac7441

File tree

2 files changed

+6
-4
lines changed

2 files changed

+6
-4
lines changed

modules/src/main/java/org/archive/modules/extractor/ExtractorHTML.java

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1090,7 +1090,7 @@ protected void processStyle(CrawlURI curi, CharSequence sequence,
10901090
* @return CharSequence context
10911091
*/
10921092
public static CharSequence elementContext(CharSequence element, CharSequence attribute) {
1093-
return attribute == null? "": element + "/@" + attribute;
1093+
return attribute == null? "": (element + "/@" + attribute).toLowerCase(Locale.ROOT);
10941094
}
10951095
}
10961096

modules/src/test/java/org/archive/modules/extractor/ExtractorHTMLTest.java

Lines changed: 5 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -545,8 +545,8 @@ public void testSourceSrcSetAttribute() throws URIException {
545545

546546
CharSequence cs = "<picture>"
547547
+ "<source media=\"(min-width: 992px)\" srcset=\"images/foo1.jpg\"> "
548-
+ "<source media=\"(min-width: 500px)\" srcset=\"images/foo2.jpg\"> "
549-
+ "<source media=\"(min-width: 0px)\" srcset=\"images/foo3.jpg\"> "
548+
+ "<source media=\"(min-width: 500px)\" SRCSET=\"images/foo2.jpg\"> "
549+
+ "<source media=\"(min-width: 0px)\" srcSet=\"images/foo3-1x.jpg 1x, images/foo3-2x.jpg 2x\"> "
550550
+ "<img src=\"images/foo.jpg\" alt=\"\"> "
551551
+ "</picture>";
552552

@@ -559,7 +559,9 @@ public void testSourceSrcSetAttribute() throws URIException {
559559
"http://www.example.com/images/foo.jpg",
560560
"http://www.example.com/images/foo1.jpg",
561561
"http://www.example.com/images/foo2.jpg",
562-
"http://www.example.com/images/foo3.jpg" };
562+
"http://www.example.com/images/foo3-1x.jpg",
563+
"http://www.example.com/images/foo3-2x.jpg",
564+
};
563565

564566
for (int i = 0; i < links.length; i++) {
565567
assertEquals("outlink from picture", dest[i], links[i].getURI());

0 commit comments

Comments
 (0)