Skip to content

Commit

Permalink
Ignore textContent links in html nodes (#1528)
Browse files Browse the repository at this point in the history
This fixes issue #1462 by extracting plaintext URIs in html5ever only when
`include_verbatim` is enabled and by pruning attribute-less URIs in html5gum unless it is enabled
  • Loading branch information
markogalevski authored Oct 13, 2024
1 parent 4ac7658 commit 2a9f11a
Show file tree
Hide file tree
Showing 3 changed files with 59 additions and 15 deletions.
10 changes: 9 additions & 1 deletion lychee-lib/src/collector.rs
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,14 @@ mod tests {
responses.map(|r| r.unwrap().uri).collect().await
}

// Test helper: builds a `Collector` with `include_verbatim(true)` and
// gathers the `uri` of every item it yields for `inputs` into a set.
// Panics (via `unwrap`) if any yielded item cannot be unwrapped.
async fn collect_verbatim(inputs: Vec<Input>, base: Option<Base>) -> HashSet<Uri> {
    Collector::new(base)
        .include_verbatim(true)
        .collect_links(inputs)
        .map(|response| response.unwrap().uri)
        .collect()
        .await
}

// URL fixtures shared by the tests in this module.
// Each constant points at a different host.
const TEST_STRING: &str = "http://test-string.com";
const TEST_URL: &str = "https://test-url.org";
const TEST_FILE: &str = "https://test-file.io";
Expand Down Expand Up @@ -233,7 +241,7 @@ mod tests {
},
];

let links = collect(inputs, None).await;
let links = collect_verbatim(inputs, None).await;

let expected_links = HashSet::from_iter([
website(TEST_STRING),
Expand Down
23 changes: 20 additions & 3 deletions lychee-lib/src/extract/html/html5ever.rs
Original file line number Diff line number Diff line change
Expand Up @@ -28,9 +28,11 @@ impl TokenSink for LinkExtractor {
if self.current_verbatim_element_name.borrow().is_some() {
return TokenSinkResult::Continue;
}
self.links
.borrow_mut()
.extend(extract_raw_uri_from_plaintext(&raw));
if self.include_verbatim {
self.links
.borrow_mut()
.extend(extract_raw_uri_from_plaintext(&raw));
}
}
Token::TagToken(tag) => {
let Tag {
Expand Down Expand Up @@ -414,6 +416,21 @@ mod tests {
assert!(uris.is_empty());
}

#[test]
fn test_ignore_text_content_links() {
    // The anchor's text content is itself a URL. With the second argument
    // `false` (presumably `include_verbatim` — confirm against the
    // `extract_html` signature), only the `href` value is expected back.
    let html = r#"
<a href="https://example.com">https://ignoreme.com</a>
"#;
    let href_only = RawUri {
        text: String::from("https://example.com"),
        element: Some(String::from("a")),
        attribute: Some(String::from("href")),
    };

    assert_eq!(extract_html(html, false), vec![href_only]);
}

#[test]
fn test_skip_dns_prefetch() {
let input = r#"
Expand Down
41 changes: 30 additions & 11 deletions lychee-lib/src/extract/html/html5gum.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,15 @@ use std::collections::{HashMap, HashSet};
use super::{is_email_link, is_verbatim_elem, srcset};
use crate::{extract::plaintext::extract_raw_uri_from_plaintext, types::uri::raw::RawUri};

/// Snapshot of the element the extractor is currently looking at.
#[derive(Debug, Default, Clone)]
struct Element {
    /// Name of the element being processed
    /// (html5gum refers to this as a "tag").
    name: String,
    /// `true` when the current element is a closing tag.
    is_closing: bool,
}

/// Extract links from HTML documents.
///
/// This is the main driver for the html5gum tokenizer.
Expand All @@ -16,7 +25,7 @@ use crate::{extract::plaintext::extract_raw_uri_from_plaintext, types::uri::raw:
///
/// The `links` vector contains all links extracted from the HTML document and
/// the `fragments` set contains all fragments extracted from the HTML document.
#[derive(Clone, Default)]
#[derive(Clone, Default, Debug)]
struct LinkExtractor {
/// Links extracted from the HTML document.
links: Vec<RawUri>,
Expand All @@ -39,15 +48,6 @@ struct LinkExtractor {
verbatim_stack: Vec<String>,
}

/// State of the element the extractor is currently processing.
///
/// Derives `Debug` so it matches the other `Element` declaration in this
/// file and can be embedded in types that themselves derive `Debug`.
#[derive(Clone, Default, Debug)]
struct Element {
    /// Current element name being processed.
    /// This is called a tag in html5gum.
    name: String,
    /// Whether the current element is a closing tag.
    is_closing: bool,
}

impl LinkExtractor {
/// Create a new `LinkExtractor`.
///
Expand Down Expand Up @@ -326,7 +326,11 @@ pub(crate) fn extract_html(buf: &str, include_verbatim: bool) -> Vec<RawUri> {
let mut extractor = LinkExtractor::new(include_verbatim);
let mut tokenizer = Tokenizer::new_with_emitter(buf, &mut extractor).infallible();
assert!(tokenizer.next().is_none());
extractor.links
extractor
.links
.into_iter()
.filter(|link| link.attribute.is_some() || include_verbatim)
.collect()
}

/// Extract fragments from id attributes within a HTML string.
Expand Down Expand Up @@ -609,6 +613,21 @@ mod tests {
assert!(uris.is_empty());
}

#[test]
fn test_ignore_text_content_links() {
    // The link text is a URL as well; with `include_verbatim == false`
    // only the attribute-backed `href` link should be returned.
    let markup = r#"
<a href="https://example.com">https://ignoreme.com</a>
"#;
    let wanted = vec![RawUri {
        text: "https://example.com".to_owned(),
        element: Some("a".to_owned()),
        attribute: Some("href".to_owned()),
    }];

    let found = extract_html(markup, false);
    assert_eq!(found, wanted);
}

#[test]
fn test_skip_dns_prefetch() {
let input = r#"
Expand Down

0 comments on commit 2a9f11a

Please sign in to comment.