Skip to content

Commit 172fa8b

Browse files
new: Warn about trackers in URLs
2 parents c41afda + 0cbc992 commit 172fa8b

File tree

14 files changed

+3158
-54
lines changed

14 files changed

+3158
-54
lines changed

lib/SpiderBits/src/ClearUrls.php

Lines changed: 201 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,201 @@
1+
<?php
2+
3+
namespace SpiderBits;
4+
5+
/**
6+
* @author Marien Fressinaud <dev@marienfressinaud.fr>
7+
* @license http://www.gnu.org/licenses/agpl-3.0.en.html AGPL
8+
*/
9+
class ClearUrls
{
    /**
     * Decoded content of the ClearURLs data file (null until the first
     * successful load).
     *
     * @var array|null
     */
    private static $clear_urls_data = null;

    /**
     * Clear a URL from its tracker parameters.
     *
     * It uses ClearURLs rules internally to clear the URL. Some behaviours
     * differ from ClearURLs:
     *
     * - if a URL matches a rule with `completeProvider=true`, an empty string
     *   is returned;
     * - referralMarketing parameters are always removed (i.e. there's no
     *   option to allow referral marketing);
     * - `forceRedirection` is ignored.
     *
     * URLs that cannot be parsed, or that have no scheme or no host, are not
     * rebuilt: only redirections and rawRules are applied to them.
     *
     * @see https://docs.clearurls.xyz/1.23.0/specs/rules/
     *
     * @param string $url
     *
     * @throws \Exception
     *     Raised if the clearurls-data.min.json file cannot be read, or
     *     cannot be parsed to JSON.
     *
     * @return string
     */
    public static function clear($url)
    {
        // A note about the regex used in this class: PCRE patterns must be
        // enclosed by delimiters. They are generally "/", "#" or "~". Problem:
        // these characters are often used in URLs, and so they can be present
        // in the patterns (in which case the `preg_*` functions may fail). It
        // is why "@" is used instead: it has very few chances to be used in
        // the patterns.

        // Default values so we don't have to check for the keys presence.
        $default_provider = [
            'urlPattern' => '',
            'completeProvider' => false,
            'rules' => [],
            'rawRules' => [],
            'referralMarketing' => [],
            'exceptions' => [],
            'redirections' => [],
            'forceRedirection' => false,
        ];

        foreach (self::loadClearUrlsProviders() as $provider) {
            $provider = array_merge($default_provider, $provider);

            // First, verify our URL matches the urlPattern (if not, skip it).
            if (!preg_match("@{$provider['urlPattern']}@i", $url)) {
                continue;
            }

            // Secondly, verify the URL is not in the exceptions list (if it
            // is, skip it). The `?? []` guards against an explicit `null`
            // value in the data file.
            if (self::matchesAnyPattern($provider['exceptions'] ?? [], $url)) {
                continue;
            }

            // If the provider is "completeProvider", the URL should be blocked
            // (i.e. an empty string in flusio context).
            if ($provider['completeProvider']) {
                return '';
            }

            // Extract redirections from the URL if any (e.g.
            // https://google.com/url?q=https://example.com)
            // If we find a redirection, we call clear() recursively (but
            // the current call ends here)
            foreach ($provider['redirections'] ?? [] as $redirection_pattern) {
                $result = preg_match("@{$redirection_pattern}@i", $url, $matches);
                if ($result && count($matches) >= 2) {
                    // the redirected URL is in the first Regex group (index 0
                    // is the full matching string).
                    $redirected_url = rawurldecode($matches[1]);
                    $redirected_url = Url::sanitize($redirected_url);
                    return self::clear($redirected_url);
                }
            }

            // Directly remove matching rawRules from the URL
            foreach ($provider['rawRules'] ?? [] as $raw_rule_pattern) {
                $replaced_url = preg_replace("@{$raw_rule_pattern}@i", '', $url);
                // preg_replace returns null on error (e.g. invalid pattern):
                // keep the current URL instead of replacing it by null.
                if ($replaced_url !== null) {
                    $url = $replaced_url;
                }
            }

            // Apply rules and referralMarketing rules to query parameters.
            // Since trackers can also be inserted in the URL fragment, we
            // clear it as well.
            $rules = array_merge(
                $provider['rules'] ?? [],
                $provider['referralMarketing'] ?? []
            );

            $parsed_url = parse_url($url);
            if (
                $parsed_url === false ||
                !isset($parsed_url['scheme'], $parsed_url['host'])
            ) {
                // The URL cannot be rebuilt safely without these parts, so we
                // keep it as it is rather than returning a broken URL.
                continue;
            }

            $query = $parsed_url['query'] ?? '';
            $fragment = $parsed_url['fragment'] ?? '';

            $cleared_query = (string) self::clearQuery($query, $rules);
            $cleared_fragment = (string) self::clearQuery($fragment, $rules);

            // Finally, rebuild the URL from the parsed and cleared parts
            $rebuilt_url = $parsed_url['scheme'] . '://';
            if (isset($parsed_url['user'])) {
                // Preserve the userinfo part (user[:pass]@) of the authority.
                $rebuilt_url .= $parsed_url['user'];
                if (isset($parsed_url['pass'])) {
                    $rebuilt_url .= ':' . $parsed_url['pass'];
                }
                $rebuilt_url .= '@';
            }
            $rebuilt_url .= $parsed_url['host'];
            if (isset($parsed_url['port'])) {
                $rebuilt_url .= ':' . $parsed_url['port'];
            }
            $rebuilt_url .= $parsed_url['path'] ?? '';

            // The query delimiter must be searched before the fragment only:
            // a "?" placed after the "#" belongs to the fragment.
            $fragment_position = strpos($url, '#');
            if ($fragment_position === false) {
                $url_before_fragment = $url;
            } else {
                $url_before_fragment = substr($url, 0, $fragment_position);
            }

            // A "?" or "#" with nothing behind it in the initial URL is kept,
            // but it is removed if the part has been emptied by the rules.
            // Strict comparisons are used because empty() considers "0" as
            // empty, while it is a valid query or fragment.
            $had_empty_query = strpos($url_before_fragment, '?') !== false && $query === '';
            if ($cleared_query !== '' || $had_empty_query) {
                $rebuilt_url .= '?' . $cleared_query;
            }

            $had_empty_fragment = $fragment_position !== false && $fragment === '';
            if ($cleared_fragment !== '' || $had_empty_fragment) {
                $rebuilt_url .= '#' . $cleared_fragment;
            }

            $url = $rebuilt_url;
        }

        return $url;
    }

    /**
     * Return whether a subject matches at least one of the given patterns.
     *
     * Patterns are matched case-insensitively and are enclosed by "@"
     * delimiters.
     *
     * @param string[] $patterns
     * @param string $subject
     *
     * @return boolean
     */
    private static function matchesAnyPattern($patterns, $subject)
    {
        foreach ($patterns as $pattern) {
            if (preg_match("@{$pattern}@i", $subject)) {
                return true;
            }
        }

        return false;
    }

    /**
     * Load and return the ClearURLs providers rules from the file.
     *
     * The file is only loaded once, even if you call this method multiple
     * times. The data are cached only once they have been validated.
     *
     * @throws \Exception
     *     Raised if the clearurls-data.min.json file cannot be read, cannot
     *     be parsed to JSON, or doesn't contain a `providers` array.
     *
     * @return array
     */
    private static function loadClearUrlsProviders()
    {
        if (self::$clear_urls_data === null) {
            $data_path = __DIR__ . '/clearurls-data.min.json';

            $clear_urls_file_content = file_get_contents($data_path);
            if ($clear_urls_file_content === false) {
                throw new \Exception(
                    $data_path . ' file cannot be found.'
                );
            }

            $data = json_decode($clear_urls_file_content, true);
            if ($data === null) {
                throw new \Exception(
                    $data_path . ' file does not contain valid JSON.'
                );
            }

            // Valid JSON is not enough: clear() iterates over the providers.
            if (
                !is_array($data) ||
                !isset($data['providers']) ||
                !is_array($data['providers'])
            ) {
                throw new \Exception(
                    $data_path . ' file does not contain a list of providers.'
                );
            }

            self::$clear_urls_data = $data;
        }

        return self::$clear_urls_data['providers'];
    }

    /**
     * Remove parameters from a URL query.
     *
     * Parameters are removed from the string if their names match any of the
     * provided rules patterns.
     *
     * @param string $query
     * @param string[] $rules
     *
     * @return string
     */
    private static function clearQuery($query, $rules)
    {
        $parameters = Url::parseQuery($query);

        foreach (array_keys($parameters) as $name) {
            // PHP turns numeric array keys into integers, but patterns must
            // be matched against strings.
            if (self::matchesAnyPattern($rules, (string) $name)) {
                unset($parameters[$name]);
            }
        }

        return Url::buildQuery($parameters);
    }
}

0 commit comments

Comments
 (0)