Skip to content

Commit

Permalink
feat: resolving relative paths using base URL
Browse files Browse the repository at this point in the history
  • Loading branch information
loyd committed Mar 25, 2017
1 parent 2fdf802 commit b215a7e
Show file tree
Hide file tree
Showing 2 changed files with 49 additions and 8 deletions.
1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,3 +8,4 @@ kuchiki = "^0.4.1"
lazy_static = "^0.2.1"
regex = "^0.2.1"
html5ever-atoms = "^0.1.3"
url = "^1.4.0"
56 changes: 48 additions & 8 deletions src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -4,16 +4,18 @@ extern crate html5ever_atoms;
extern crate lazy_static;
extern crate kuchiki;
extern crate regex;
extern crate url;

use std::cmp;
use std::iter;
use std::f32;

use regex::Regex;
use html5ever_atoms::QualName;
use kuchiki::{NodeRef, NodeDataRef, NodeData, ElementData};
use kuchiki::{NodeRef, NodeDataRef, NodeData, ElementData, Attributes};
use kuchiki::traits::TendrilSink;
use kuchiki::iter::NodeIterator;
use url::Url;

use node_cache::NodeCache;

Expand Down Expand Up @@ -92,6 +94,8 @@ lazy_static! {
static ref VIDEO: Regex = Regex::new(r"(?xi)
//(www\.)?(dailymotion|youtube|youtube-nocookie|player\.vimeo)\.com
").unwrap();

static ref PROTOCOL: Regex = Regex::new(r"^\w+:").unwrap();
}

macro_rules! tag {
Expand Down Expand Up @@ -246,13 +250,32 @@ fn is_stuffed(elem: &ElemRef, info: &NodeInfo) -> bool {
info.text_len > 0 || info.img_count + info.embed_count > 0
}

fn clean_attributes(elem: &ElemRef) {
let mut attributes = elem.attributes.borrow_mut();

/// Removes the `style` attribute from the given attribute set.
///
/// All other attributes are left as they are.
fn clean_attributes(attributes: &mut Attributes) {
// TODO: what about removing all except for `alt`, `href`, `src` and `title`?
attributes.remove(attrib!("style"));
}

/// Rewrites the `href` and `src` attributes, when present, so that relative
/// values are resolved against `base_url`.
///
/// Values that match the `PROTOCOL` pattern (i.e. start with `scheme:`) and
/// values that start with `#` are kept unchanged, as are values that cannot
/// be joined onto the base.
fn fix_relative_urls(attributes: &mut Attributes, base_url: &Url) {
    /// Replaces `link` in place with its resolution against `base`.
    fn absolutize(link: &mut String, base: &Url) {
        // Absolute links and in-page anchors must not be rewritten.
        let needs_resolving = !PROTOCOL.is_match(link) && !link.starts_with('#');

        if needs_resolving {
            // A failed join is not an error here: the link simply stays as it was.
            if let Ok(absolute) = base.join(link.as_str()) {
                *link = absolute.into_string();
            }
        }
    }

    if let Some(href) = attributes.get_mut(attrib!("href")) {
        absolutize(href, base_url);
    }

    if let Some(src) = attributes.get_mut(attrib!("src")) {
        absolutize(src, base_url);
    }
}

#[derive(Default, PartialEq, Clone)]
struct NodeInfo {
content_score: f32,
Expand All @@ -275,6 +298,7 @@ pub struct Readability {
weight_classes: bool,
clean_conditionally: bool,
clean_attributes: bool,
base_url: Option<Url>
}

impl Readability {
Expand All @@ -287,6 +311,7 @@ impl Readability {
weight_classes: true,
clean_conditionally: true,
clean_attributes: true,
base_url: None,
}
}

Expand All @@ -310,12 +335,19 @@ impl Readability {
self
}

pub fn parse(self, html: &str) -> NodeRef {
/// Sets the base URL stored on this `Readability`.
///
/// Accepts anything convertible into `Option<Url>`, so the stored value
/// can be either set or cleared. Returns `self` so that calls can be chained.
pub fn base_url<U: Into<Option<Url>>>(&mut self, url: U) -> &mut Self {
    let base: Option<Url> = url.into();
    self.base_url = base;
    self
}

/// Parses `html` into a document tree and runs it through `readify`,
/// returning the node that `readify` produces.
pub fn parse(&mut self, html: &str) -> NodeRef {
    self.readify(kuchiki::parse_html().one(html))
}

fn readify(mut self, top_level: NodeRef) -> NodeRef {
fn readify(&mut self, top_level: NodeRef) -> NodeRef {
let mut current = top_level.clone();
let mut bubbling = false;

Expand Down Expand Up @@ -398,7 +430,7 @@ impl Readability {
parent_info.text_len += char_cnt;
parent_info.commas += comma_cnt;
},
NodeData::Element(ElementData { ref name, .. }) => {
NodeData::Element(ElementData { ref name, ref attributes, .. }) => {
self.propagate_info(node);

if is_tag_to_score(name) {
Expand All @@ -416,8 +448,16 @@ impl Readability {
return;
}

let mut attributes = attributes.borrow_mut();

if self.clean_attributes {
clean_attributes(&elem);
clean_attributes(&mut *attributes);
}

if let Some(ref base_url) = self.base_url {
if *name == tag!("a") || *name == tag!("img") {
fix_relative_urls(&mut *attributes, base_url);
}
}
},
_ => {}
Expand Down

0 comments on commit b215a7e

Please sign in to comment.