From 1b1fd0c7074a62a8e0addfe329bb77ff5f08c440 Mon Sep 17 00:00:00 2001 From: Techassi Date: Sun, 17 Sep 2023 21:39:01 +0200 Subject: [PATCH] feat: Add support for ranges in the `--accept` option / config field (#1167) Adds support for accept ranges discussed in #1157. This allows the user to specify custom HTTP status codes that are accepted during checking and are therefore reported as valid (not broken). Previously, the accept option only supported specifying individual status codes as a comma-separated list. With this PR, the option will accept a list of status code ranges formatted like this: ```toml accept = ["100..=103", "200..=299", "403"] ``` These combinations will be supported: `<start>..<end>`, `<start>..=<end>`, `..<end>` and `..=<end>`. The behavior mirrors Rust's range syntax: ``` ..<end>, includes 0 to <end> (exclusive) ..=<end>, includes 0 to <end> (inclusive) <start>..<end>, includes <start> to <end> (exclusive) <start>..=<end>, includes <start> to <end> (inclusive) ``` - Foundation and enhancements for accept ranges, including support for comma-separated strings and integration into the CLI. - Implementations and updates for AcceptSelector, including Default, Display, and serde defaults. - Address and fix various errors: clippy, cargo fmt, and tests. - Add more tests, address edge cases, and enhance error messaging, especially for TOML config parsing. - Update dependencies. 
--- Cargo.lock | 52 ++++++ README.md | 38 +++- fixtures/configs/smoketest.toml | 8 +- lychee-bin/src/client.rs | 17 +- lychee-bin/src/commands/check.rs | 8 +- lychee-bin/src/options.rs | 37 +++- lychee-bin/src/parse.rs | 25 +-- lychee-lib/Cargo.toml | 2 + lychee-lib/src/lib.rs | 6 +- lychee-lib/src/types/accept/mod.rs | 5 + lychee-lib/src/types/accept/range.rs | 212 ++++++++++++++++++++++ lychee-lib/src/types/accept/selector.rs | 222 ++++++++++++++++++++++++ lychee-lib/src/types/error.rs | 6 + lychee-lib/src/types/mod.rs | 2 + lychee-lib/src/types/status.rs | 8 +- lychee.example.toml | 8 +- 16 files changed, 598 insertions(+), 58 deletions(-) create mode 100644 lychee-lib/src/types/accept/mod.rs create mode 100644 lychee-lib/src/types/accept/range.rs create mode 100644 lychee-lib/src/types/accept/selector.rs diff --git a/Cargo.lock b/Cargo.lock index d4c8e09cdf..942eb111ae 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2154,6 +2154,7 @@ dependencies = [ "reqwest", "reqwest_cookie_store", "ring", + "rstest", "secrecy", "serde", "serde_json", @@ -2162,6 +2163,7 @@ dependencies = [ "tempfile", "thiserror", "tokio", + "toml", "typed-builder", "url", "wiremock", @@ -3001,6 +3003,12 @@ version = "0.7.5" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dbb5fb1acd8a1a18b3dd5be62d25485eb770e05afb408a9627d14d451bae12da" +[[package]] +name = "relative-path" +version = "1.8.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4bf2521270932c3c7bed1a59151222bd7643c79310f2916f01925e1e16255698" + [[package]] name = "reqwest" version = "0.11.20" @@ -3092,12 +3100,50 @@ dependencies = [ "winapi", ] +[[package]] +name = "rstest" +version = "0.18.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "2b96577ca10cb3eade7b337eb46520108a67ca2818a24d0b63f41fd62bc9651c" +dependencies = [ + "futures", + "futures-timer", + "rstest_macros", + "rustc_version", +] + +[[package]] +name = "rstest_macros" +version = 
"0.18.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "225e674cf31712b8bb15fdbca3ec0c1b9d825c5a24407ff2b7e005fb6a29ba03" +dependencies = [ + "cfg-if", + "glob", + "proc-macro2", + "quote", + "regex", + "relative-path", + "rustc_version", + "syn 2.0.28", + "unicode-ident", +] + [[package]] name = "rustc-demangle" version = "0.1.23" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "d626bb9dae77e28219937af045c257c28bfd3f69333c512553507f5f9798cb76" +[[package]] +name = "rustc_version" +version = "0.4.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bfa0f585226d2e68097d4f95d113b15b83a82e819ab25717ec0590d9584ef366" +dependencies = [ + "semver", +] + [[package]] name = "rustix" version = "0.37.23" @@ -3247,6 +3293,12 @@ dependencies = [ "libc", ] +[[package]] +name = "semver" +version = "1.0.18" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b0293b4b29daaf487284529cc2f5675b8e57c61f70167ba415a463651fd6a918" + [[package]] name = "serde" version = "1.0.188" diff --git a/README.md b/README.md index 5ddca47001..c4874c57be 100644 --- a/README.md +++ b/README.md @@ -20,9 +20,30 @@ Available as a command-line utility, a library and a [GitHub Action](https://git ## Table of Contents +- [Table of Contents](#table-of-contents) - [Installation](#installation) + - [Arch Linux](#arch-linux) + - [macOS](#macos) + - [Docker](#docker) + - [NixOS](#nixos) + - [FreeBSD](#freebsd) + - [Scoop](#scoop) + - [Termux](#termux) + - [Pre-built binaries](#pre-built-binaries) + - [Cargo](#cargo) + - [Build dependencies](#build-dependencies) + - [Compile and install lychee](#compile-and-install-lychee) + - [Feature flags](#feature-flags) - [Features](#features) - [Commandline usage](#commandline-usage) + - [Docker Usage](#docker-usage) + - [Linux/macOS shell command](#linuxmacos-shell-command) + - [Windows PowerShell command](#windows-powershell-command) + - [GitHub 
Token](#github-token) + - [Commandline Parameters](#commandline-parameters) + - [Exit codes](#exit-codes) + - [Ignoring links](#ignoring-links) + - [Caching](#caching) - [Library usage](#library-usage) - [GitHub Action Usage](#github-action-usage) - [Contributing to lychee](#contributing-to-lychee) @@ -384,7 +405,22 @@ Options: Custom request header -a, --accept - Comma-separated list of accepted status codes for valid links + A List of accepted status codes for valid links + + The following accept range syntax is supported: [start]..[=]end|code. Some valid + examples are: + + - 200..=204 + - 200..204 + - ..=204 + - ..204 + - 200 + + Use "lychee --accept '200..=204, 429, 500' ..." to provide a comma- + separated list of accepted status codes. This example will accept 200, 201, + 202, 203, 204, 429, and 500 as valid status codes. + + [default: 100..=103,200..=299,403..=403] --include-fragments Enable the checking of fragments in links diff --git a/fixtures/configs/smoketest.toml b/fixtures/configs/smoketest.toml index afa526b7fd..5cd5e3b95c 100644 --- a/fixtures/configs/smoketest.toml +++ b/fixtures/configs/smoketest.toml @@ -46,7 +46,13 @@ timeout = 20 retry_wait_time = 2 # Comma-separated list of accepted status codes for valid links. -accept = [200, 429] +# Supported values are: +# +# accept = ["200..=204", "429"] +# accept = "200..=204, 429" +# accept = ["200", "429"] +# accept = "200, 429" +accept = ["200", "429"] # Proceed for server connections considered insecure (invalid TLS). insecure = false diff --git a/lychee-bin/src/client.rs b/lychee-bin/src/client.rs index e2297c8ea5..a7f53c2db7 100644 --- a/lychee-bin/src/client.rs +++ b/lychee-bin/src/client.rs @@ -26,16 +26,13 @@ pub(crate) fn create(cfg: &Config, cookie_jar: Option<&Arc>) - cfg.scheme.clone() }; - let accepted = match cfg.accept { - Some(ref accepted) => { - let accepted: Result, _> = accepted - .iter() - .map(|code| StatusCode::from_u16(*code)) - .collect(); - Some(accepted?) 
- } - None => None, - }; + let accepted = cfg + .accept + .clone() + .into_set() + .iter() + .map(|value| StatusCode::from_u16(*value)) + .collect::, _>>()?; // `exclude_mail` will be removed in 1.0. Until then, we need to support it. // Therefore, we need to check if both `include_mail` and `exclude_mail` are set to `true` diff --git a/lychee-bin/src/commands/check.rs b/lychee-bin/src/commands/check.rs index 7acd88e864..92da1ad614 100644 --- a/lychee-bin/src/commands/check.rs +++ b/lychee-bin/src/commands/check.rs @@ -44,7 +44,7 @@ where let client = params.client; let cache = params.cache; - let accept = params.cfg.accept; + let accept = params.cfg.accept.into_set(); let pb = if params.cfg.no_progress || params.cfg.verbose.log_level() >= log::Level::Info { None @@ -207,7 +207,7 @@ async fn request_channel_task( max_concurrency: usize, client: Client, cache: Arc, - accept: Option>, + accept: HashSet, ) { StreamExt::for_each_concurrent( ReceiverStream::new(recv_req), @@ -230,7 +230,7 @@ async fn handle( client: &Client, cache: Arc, request: Request, - accept: Option>, + accept: HashSet, ) -> Response { let uri = request.uri.clone(); if let Some(v) = cache.get(&uri) { @@ -244,7 +244,7 @@ async fn handle( // `accepted` status codes might have changed from the previous run // and they may have an impact on the interpretation of the status // code. 
- Status::from_cache_status(v.value().status, accept) + Status::from_cache_status(v.value().status, &accept) }; return Response::new(uri.clone(), status, request.source); } diff --git a/lychee-bin/src/options.rs b/lychee-bin/src/options.rs index 2e40f379f0..00b97b64cb 100644 --- a/lychee-bin/src/options.rs +++ b/lychee-bin/src/options.rs @@ -1,17 +1,17 @@ use crate::archive::Archive; -use crate::parse::{parse_base, parse_statuscodes}; +use crate::parse::parse_base; use crate::verbosity::Verbosity; use anyhow::{anyhow, Context, Error, Result}; use clap::{arg, builder::TypedValueParser, Parser}; use const_format::{concatcp, formatcp}; use lychee_lib::{ - Base, BasicAuthSelector, Input, DEFAULT_MAX_REDIRECTS, DEFAULT_MAX_RETRIES, + AcceptSelector, Base, BasicAuthSelector, Input, DEFAULT_MAX_REDIRECTS, DEFAULT_MAX_RETRIES, DEFAULT_RETRY_WAIT_TIME_SECS, DEFAULT_TIMEOUT_SECS, DEFAULT_USER_AGENT, }; use secrecy::{ExposeSecret, SecretString}; use serde::Deserialize; use std::path::Path; -use std::{collections::HashSet, fs, path::PathBuf, str::FromStr, time::Duration}; +use std::{fs, path::PathBuf, str::FromStr, time::Duration}; use strum::VariantNames; pub(crate) const LYCHEE_IGNORE_FILE: &str = ".lycheeignore"; @@ -91,6 +91,7 @@ default_function! 
{ retry_wait_time: usize = DEFAULT_RETRY_WAIT_TIME_SECS; method: String = DEFAULT_METHOD.to_string(); verbosity: Verbosity = Verbosity::default(); + accept_selector: AcceptSelector = AcceptSelector::default(); } // Macro for merging configuration values @@ -304,10 +305,28 @@ pub(crate) struct Config { #[serde(default)] pub(crate) header: Vec, - /// Comma-separated list of accepted status codes for valid links - #[arg(short, long, value_parser = parse_statuscodes)] - #[serde(default)] - pub(crate) accept: Option>, + /// A List of accepted status codes for valid links + #[arg( + short, + long, + default_value_t, + long_help = "A List of accepted status codes for valid links + +The following accept range syntax is supported: [start]..[=]end|code. Some valid +examples are: + +- 200..=204 +- 200..204 +- ..=204 +- ..204 +- 200 + +Use \"lychee --accept '200..=204, 429, 500' ...\" to provide a comma- +separated list of accepted status codes. This example will accept 200, 201, +202, 203, 204, 429, and 500 as valid status codes." + )] + #[serde(default = "accept_selector")] + pub(crate) accept: AcceptSelector, /// Enable the checking of fragments in links. 
#[arg(long)] @@ -389,7 +408,8 @@ impl Config { pub(crate) fn load_from_file(path: &Path) -> Result { // Read configuration file let contents = fs::read_to_string(path)?; - toml::from_str(&contents).context("Failed to parse configuration file") + toml::from_str(&contents) + .map_err(|err| anyhow::anyhow!("Failed to parse configuration file: {}", err)) } /// Merge the configuration from TOML into the CLI configuration @@ -421,7 +441,6 @@ impl Config { exclude_mail: false; remap: Vec::::new(); header: Vec::::new(); - accept: None; timeout: DEFAULT_TIMEOUT_SECS; retry_wait_time: DEFAULT_RETRY_WAIT_TIME_SECS; method: DEFAULT_METHOD; diff --git a/lychee-bin/src/parse.rs b/lychee-bin/src/parse.rs index fddb7c4028..b5a4cca732 100644 --- a/lychee-bin/src/parse.rs +++ b/lychee-bin/src/parse.rs @@ -1,7 +1,7 @@ use anyhow::{anyhow, Context, Result}; use headers::{HeaderMap, HeaderName}; use lychee_lib::{remap::Remaps, Base}; -use std::{collections::HashSet, time::Duration}; +use std::time::Duration; /// Split a single HTTP header into a (key, value) tuple fn read_header(input: &str) -> Result<(String, String)> { @@ -40,24 +40,8 @@ pub(crate) fn parse_base(src: &str) -> Result { Base::try_from(src) } -/// Parse HTTP status codes into a set of `StatusCode` -/// -/// Note that this function does not convert the status codes into -/// `StatusCode` but rather into `u16` to avoid the need for -/// `http` as a dependency and to support custom status codes, which are -/// necessary for some websites, which don't adhere to the HTTP spec or IANA. 
-pub(crate) fn parse_statuscodes(accept: &str) -> Result> { - let mut statuscodes = HashSet::new(); - for code in accept.split(',') { - let code: u16 = code.parse::()?; - statuscodes.insert(code); - } - Ok(statuscodes) -} - #[cfg(test)] mod tests { - use std::collections::HashSet; use headers::HeaderMap; use regex::Regex; @@ -72,13 +56,6 @@ mod tests { assert_eq!(parse_headers(&["accept=text/html"]).unwrap(), custom); } - #[test] - fn test_parse_statuscodes() { - let actual = parse_statuscodes("200,204,301").unwrap(); - let expected = IntoIterator::into_iter([200, 204, 301]).collect::>(); - assert_eq!(actual, expected); - } - #[test] fn test_parse_remap() { let remaps = diff --git a/lychee-lib/Cargo.toml b/lychee-lib/Cargo.toml index 9dd8f7a310..7cae4262df 100644 --- a/lychee-lib/Cargo.toml +++ b/lychee-lib/Cargo.toml @@ -66,6 +66,8 @@ doc-comment = "0.3.3" tempfile = "3.8.0" wiremock = "0.5.19" serde_json = "1.0.105" +rstest = "0.18.1" +toml = "0.7.6" [features] diff --git a/lychee-lib/src/lib.rs b/lychee-lib/src/lib.rs index 878d3e844e..a74b899bc5 100644 --- a/lychee-lib/src/lib.rs +++ b/lychee-lib/src/lib.rs @@ -91,8 +91,8 @@ pub use crate::{ collector::Collector, filter::{Excludes, Filter, Includes}, types::{ - uri::valid::Uri, Base, BasicAuthCredentials, BasicAuthSelector, CacheStatus, CookieJar, - ErrorKind, FileType, Input, InputContent, InputSource, Request, Response, ResponseBody, - Result, Status, + uri::valid::Uri, AcceptRange, AcceptRangeError, AcceptSelector, Base, BasicAuthCredentials, + BasicAuthSelector, CacheStatus, CookieJar, ErrorKind, FileType, Input, InputContent, + InputSource, Request, Response, ResponseBody, Result, Status, }, }; diff --git a/lychee-lib/src/types/accept/mod.rs b/lychee-lib/src/types/accept/mod.rs new file mode 100644 index 0000000000..dc50b69384 --- /dev/null +++ b/lychee-lib/src/types/accept/mod.rs @@ -0,0 +1,5 @@ +mod range; +mod selector; + +pub use range::*; +pub use selector::*; diff --git 
a/lychee-lib/src/types/accept/range.rs b/lychee-lib/src/types/accept/range.rs new file mode 100644 index 0000000000..2f4c7b914b --- /dev/null +++ b/lychee-lib/src/types/accept/range.rs @@ -0,0 +1,212 @@ +use std::{fmt::Display, num::ParseIntError, ops::RangeInclusive, str::FromStr}; + +use once_cell::sync::Lazy; +use regex::Regex; +use thiserror::Error; + +static RANGE_PATTERN: Lazy = + Lazy::new(|| Regex::new(r"^([0-9]{3})?\.\.(=?)([0-9]{3})+$|^([0-9]{3})$").unwrap()); + +/// The [`AcceptRangeError`] indicates that the parsing process of an +/// [`AcceptRange`] from a string failed due to various underlying reasons. +#[derive(Debug, Error, PartialEq)] +pub enum AcceptRangeError { + /// The string input didn't contain any range pattern. + #[error("no range pattern found")] + NoRangePattern, + + /// The start or end index could not be parsed as an integer. + #[error("failed to parse str as integer")] + ParseIntError(#[from] ParseIntError), + + /// The start index is larger than the end index. + #[error("invalid range indices, only start < end supported")] + InvalidRangeIndices, +} + +/// [`AcceptRange`] specifies which HTTP status codes are accepted and +/// considered successful when checking a remote URL. 
+#[derive(Clone, Debug, PartialEq)] +pub struct AcceptRange(RangeInclusive); + +impl FromStr for AcceptRange { + type Err = AcceptRangeError; + + fn from_str(s: &str) -> Result { + let captures = RANGE_PATTERN + .captures(s) + .ok_or(AcceptRangeError::NoRangePattern)?; + + if let Some(value) = captures.get(4) { + let value: u16 = value.as_str().parse()?; + Self::new_from(value, value) + } else { + let start: u16 = match captures.get(1) { + Some(start) => start.as_str().parse().unwrap_or_default(), + None => 0, + }; + + let inclusive = !captures[2].is_empty(); + let end: u16 = captures[3].parse()?; + + if inclusive { + Self::new_from(start, end) + } else { + Self::new_from(start, end - 1) + } + } + } +} + +impl Display for AcceptRange { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}..={}", self.start(), self.end()) + } +} + +impl AcceptRange { + /// Creates a new [`AcceptRange`] which matches values between `start` and + /// `end` (both inclusive). + #[must_use] + pub const fn new(start: u16, end: u16) -> Self { + Self(RangeInclusive::new(start, end)) + } + + /// Creates a new [`AcceptRange`] which matches values between `start` and + /// `end` (both inclusive). It additionally validates that `start` <= `end`. + /// + /// # Errors + /// + /// Returns an error if `start` > `end`. + pub const fn new_from(start: u16, end: u16) -> Result { + if start > end { + return Err(AcceptRangeError::InvalidRangeIndices); + } + + Ok(Self::new(start, end)) + } + + /// Returns the `start` value of this [`AcceptRange`]. + #[must_use] + pub const fn start(&self) -> &u16 { + self.0.start() + } + + /// Returns the `end` value of this [`AcceptRange`]. + #[must_use] + pub const fn end(&self) -> &u16 { + self.0.end() + } + + /// Returns whether this [`AcceptRange`] contains `value`. + #[must_use] + pub fn contains(&self, value: u16) -> bool { + self.0.contains(&value) + } + + /// Consumes self and returns the inner range. 
+ #[must_use] + pub const fn inner(self) -> RangeInclusive { + self.0 + } + + pub(crate) fn update_start(&mut self, new_start: u16) -> Result<(), AcceptRangeError> { + let end = *self.end(); + + if new_start > end { + return Err(AcceptRangeError::InvalidRangeIndices); + } + + self.0 = RangeInclusive::new(new_start, end); + Ok(()) + } + + pub(crate) fn update_end(&mut self, new_end: u16) -> Result<(), AcceptRangeError> { + let start = *self.start(); + + if start > new_end { + return Err(AcceptRangeError::InvalidRangeIndices); + } + + self.0 = RangeInclusive::new(*self.start(), new_end); + Ok(()) + } + + pub(crate) fn merge(&mut self, other: &Self) -> bool { + // Merge when the end value of self overlaps with other's start + if self.end() >= other.start() && other.end() >= self.end() { + // We can ignore the result here, as it is guaranteed that + // start < new_end + let _ = self.update_end(*other.end()); + return true; + } + + // Merge when the start value of self overlaps with other's end + if self.start() <= other.end() && other.start() <= self.start() { + // We can ignore the result here, as it is guaranteed that + // start < new_end + let _ = self.update_start(*other.start()); + return true; + } + + false + } +} + +#[cfg(test)] +mod test { + use super::*; + use rstest::rstest; + + #[rstest] + #[case("100..=200", vec![100, 150, 200], vec![250, 300])] + #[case("..=100", vec![0, 50, 100], vec![150, 200])] + #[case("100..200", vec![100, 150], vec![200, 250])] + #[case("..100", vec![0, 50], vec![100, 150])] + #[case("404", vec![404], vec![200, 304, 500])] + fn test_from_str( + #[case] input: &str, + #[case] valid_values: Vec, + #[case] invalid_values: Vec, + ) { + let range = AcceptRange::from_str(input).unwrap(); + + for valid in valid_values { + assert!(range.contains(valid)); + } + + for invalid in invalid_values { + assert!(!range.contains(invalid)); + } + } + + #[rstest] + #[case("200..=100", AcceptRangeError::InvalidRangeIndices)] + #[case("-100..=100", 
AcceptRangeError::NoRangePattern)] + #[case("-100..100", AcceptRangeError::NoRangePattern)] + #[case("100..=-100", AcceptRangeError::NoRangePattern)] + #[case("100..-100", AcceptRangeError::NoRangePattern)] + #[case("0..0", AcceptRangeError::NoRangePattern)] + #[case("abcd", AcceptRangeError::NoRangePattern)] + #[case("-1", AcceptRangeError::NoRangePattern)] + #[case("0", AcceptRangeError::NoRangePattern)] + fn test_from_str_invalid(#[case] input: &str, #[case] error: AcceptRangeError) { + let range = AcceptRange::from_str(input); + assert_eq!(range, Err(error)); + } + + #[rstest] + #[case("100..=200", "210..=300", "100..=200")] + #[case("100..=200", "190..=300", "100..=300")] + #[case("100..200", "200..300", "100..200")] + #[case("100..200", "190..300", "100..300")] + fn test_merge(#[case] range: &str, #[case] other: &str, #[case] result: &str) { + let mut range = AcceptRange::from_str(range).unwrap(); + let other = AcceptRange::from_str(other).unwrap(); + + let result = AcceptRange::from_str(result).unwrap(); + range.merge(&other); + + assert_eq!(result, range); + } +} diff --git a/lychee-lib/src/types/accept/selector.rs b/lychee-lib/src/types/accept/selector.rs new file mode 100644 index 0000000000..301086b099 --- /dev/null +++ b/lychee-lib/src/types/accept/selector.rs @@ -0,0 +1,222 @@ +use std::{collections::HashSet, fmt::Display, str::FromStr}; + +use serde::{de::Visitor, Deserialize}; +use thiserror::Error; + +use crate::{types::accept::AcceptRange, AcceptRangeError}; + +#[derive(Debug, Error)] +pub enum AcceptSelectorError { + #[error("invalid/empty input")] + InvalidInput, + + #[error("failed to parse accept range: {0}")] + AcceptRangeError(#[from] AcceptRangeError), +} + +/// An [`AcceptSelector`] determines if a returned HTTP status code should be +/// accepted and thus counted as a valid (not broken) link. 
+#[derive(Clone, Debug, PartialEq)] +pub struct AcceptSelector { + ranges: Vec, +} + +impl FromStr for AcceptSelector { + type Err = AcceptSelectorError; + + fn from_str(input: &str) -> Result { + let input = input.trim(); + + if input.is_empty() { + return Err(AcceptSelectorError::InvalidInput); + } + + let ranges = input + .split(',') + .map(|part| AcceptRange::from_str(part.trim())) + .collect::, AcceptRangeError>>()?; + + Ok(Self::new_from(ranges)) + } +} + +impl Default for AcceptSelector { + fn default() -> Self { + Self::new_from(vec![ + AcceptRange::new(100, 103), + AcceptRange::new(200, 299), + AcceptRange::new(403, 403), + ]) + } +} + +impl Display for AcceptSelector { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + let ranges: Vec<_> = self.ranges.iter().map(ToString::to_string).collect(); + write!(f, "{}", ranges.join(",")) + } +} + +impl AcceptSelector { + /// Creates a new empty [`AcceptSelector`]. + #[must_use] + pub const fn new() -> Self { + Self { ranges: Vec::new() } + } + + /// Creates a new [`AcceptSelector`] prefilled with `ranges`. + #[must_use] + pub fn new_from(ranges: Vec) -> Self { + let mut selector = Self::new(); + + for range in ranges { + selector.add_range(range); + } + + selector + } + + /// Adds a range of accepted HTTP status codes to this [`AcceptSelector`]. + /// This method merges the new and existing ranges if they overlap. + pub fn add_range(&mut self, range: AcceptRange) -> &mut Self { + // Merge with previous range if possible + if let Some(last) = self.ranges.last_mut() { + if last.merge(&range) { + return self; + } + } + + // If neither is the case, the ranges have no overlap at all. Just add + // to the list of ranges. + self.ranges.push(range); + self + } + + /// Returns whether this [`AcceptSelector`] contains `value`. 
+ #[must_use] + pub fn contains(&self, value: u16) -> bool { + self.ranges.iter().any(|range| range.contains(value)) + } + + /// Consumes self and creates a [`HashSet`] which contains all + /// accepted status codes. + #[must_use] + pub fn into_set(self) -> HashSet { + let mut set = HashSet::new(); + + for range in self.ranges { + for value in range.inner() { + set.insert(value); + } + } + + set + } + + #[cfg(test)] + pub(crate) fn len(&self) -> usize { + self.ranges.len() + } +} + +struct AcceptSelectorVisitor; + +impl<'de> Visitor<'de> for AcceptSelectorVisitor { + type Value = AcceptSelector; + + fn expecting(&self, formatter: &mut std::fmt::Formatter) -> std::fmt::Result { + formatter.write_str("a string or a sequence of strings") + } + + fn visit_str(self, v: &str) -> Result + where + E: serde::de::Error, + { + AcceptSelector::from_str(v).map_err(serde::de::Error::custom) + } + + fn visit_seq(self, mut seq: A) -> Result + where + A: serde::de::SeqAccess<'de>, + { + let mut selector = AcceptSelector::new(); + while let Some(value) = seq.next_element::()? 
{ + selector.add_range(AcceptRange::from_str(&value).map_err(serde::de::Error::custom)?); + } + Ok(selector) + } +} + +impl<'de> Deserialize<'de> for AcceptSelector { + fn deserialize(deserializer: D) -> Result + where + D: serde::Deserializer<'de>, + { + deserializer.deserialize_any(AcceptSelectorVisitor) + } +} + +#[cfg(test)] +mod test { + use super::*; + use rstest::rstest; + + #[rstest] + #[case("100..=150,200..=300", vec![100, 110, 150, 200, 300], vec![175, 350], 2)] + #[case("200..=300,100..=250", vec![100, 150, 200, 250, 300], vec![350], 1)] + #[case("100..=200,150..=200", vec![100, 150, 200], vec![250, 300], 1)] + #[case("100..=200,300", vec![100, 110, 200, 300], vec![250, 350], 2)] + fn test_from_str( + #[case] input: &str, + #[case] valid_values: Vec, + #[case] invalid_values: Vec, + #[case] length: usize, + ) { + let selector = AcceptSelector::from_str(input).unwrap(); + assert_eq!(selector.len(), length); + + for valid in valid_values { + assert!(selector.contains(valid)); + } + + for invalid in invalid_values { + assert!(!selector.contains(invalid)); + } + } + + #[rstest] + #[case(r"accept = ['200..204', '429']", vec![200, 203, 429], vec![204, 404], 2)] + #[case(r"accept = '200..204, 429'", vec![200, 203, 429], vec![204, 404], 2)] + #[case(r"accept = ['200', '429']", vec![200, 429], vec![404], 2)] + #[case(r"accept = '200, 429'", vec![200, 429], vec![404], 2)] + fn test_deserialize( + #[case] input: &str, + #[case] valid_values: Vec, + #[case] invalid_values: Vec, + #[case] length: usize, + ) { + #[derive(Deserialize)] + struct Config { + accept: AcceptSelector, + } + + let config: Config = toml::from_str(input).unwrap(); + assert_eq!(config.accept.len(), length); + + for valid in valid_values { + assert!(config.accept.contains(valid)); + } + + for invalid in invalid_values { + assert!(!config.accept.contains(invalid)); + } + } + + #[rstest] + #[case("100..=150,200..=300", "100..=150,200..=300")] + #[case("100..=150,300", "100..=150,300..=300")] + fn 
test_display(#[case] input: &str, #[case] display: &str) { + let selector = AcceptSelector::from_str(input).unwrap(); + assert_eq!(selector.to_string(), display); + } +} diff --git a/lychee-lib/src/types/error.rs b/lychee-lib/src/types/error.rs index 48575574a7..5d4d4a546f 100644 --- a/lychee-lib/src/types/error.rs +++ b/lychee-lib/src/types/error.rs @@ -6,6 +6,7 @@ use thiserror::Error; use tokio::task::JoinError; use super::InputContent; +use crate::types::AcceptSelectorError; use crate::{basic_auth::BasicAuthExtractorError, utils, Uri}; /// Kinds of status errors @@ -140,6 +141,10 @@ pub enum ErrorKind { /// Cannot load cookies #[error("Cannot load cookies")] Cookies(String), + + /// Accept selector parse error + #[error("Accept range error")] + AcceptSelectorError(#[from] AcceptSelectorError), } impl ErrorKind { @@ -285,6 +290,7 @@ impl Hash for ErrorKind { Self::TooManyRedirects(e) => e.to_string().hash(state), Self::BasicAuthExtractorError(e) => e.to_string().hash(state), Self::Cookies(e) => e.to_string().hash(state), + Self::AcceptSelectorError(e) => e.to_string().hash(state), } } } diff --git a/lychee-lib/src/types/mod.rs b/lychee-lib/src/types/mod.rs index 04df8a7f4b..6d0ee51cd7 100644 --- a/lychee-lib/src/types/mod.rs +++ b/lychee-lib/src/types/mod.rs @@ -1,5 +1,6 @@ #![allow(unreachable_pub)] +mod accept; mod base; mod basic_auth; mod cache; @@ -13,6 +14,7 @@ mod response; mod status; pub(crate) mod uri; +pub use accept::*; pub use base::Base; pub use basic_auth::{BasicAuthCredentials, BasicAuthSelector}; pub use cache::CacheStatus; diff --git a/lychee-lib/src/types/status.rs b/lychee-lib/src/types/status.rs index b9e7256e9e..7360128e88 100644 --- a/lychee-lib/src/types/status.rs +++ b/lychee-lib/src/types/status.rs @@ -104,19 +104,17 @@ impl Status { /// because they are provided by the user and can be invalid according to /// the HTTP spec and IANA, but the user might still want to accept them. 
#[must_use] - pub fn from_cache_status(s: CacheStatus, accepted: Option>) -> Self { + pub fn from_cache_status(s: CacheStatus, accepted: &HashSet) -> Self { match s { CacheStatus::Ok(code) => { - if matches!(s, CacheStatus::Ok(_)) - || accepted.map(|a| a.contains(&code)) == Some(true) - { + if matches!(s, CacheStatus::Ok(_)) || accepted.contains(&code) { return Self::Cached(CacheStatus::Ok(code)); }; Self::Cached(CacheStatus::Error(Some(code))) } CacheStatus::Error(code) => { if let Some(code) = code { - if accepted.map(|a| a.contains(&code)) == Some(true) { + if accepted.contains(&code) { return Self::Cached(CacheStatus::Ok(code)); }; } diff --git a/lychee.example.toml b/lychee.example.toml index fab29aca41..0e023055e2 100644 --- a/lychee.example.toml +++ b/lychee.example.toml @@ -46,7 +46,13 @@ timeout = 20 retry_wait_time = 2 # Comma-separated list of accepted status codes for valid links. -accept = [200, 429] +# Supported values are: +# +# accept = ["200..=204", "429"] +# accept = "200..=204, 429" +# accept = ["200", "429"] +# accept = "200, 429" +accept = ["200", "429"] # Proceed for server connections considered insecure (invalid TLS). insecure = false