fix: improve web crawler indexation blocking (#8896)
- totally deny access to some bots (SeekportBot, SEO bots, Scrapy, ...) on the
whole domain by serving them noindex pages
- add more bots to the crawler list (GPTBot, Bytespider, ...)
raphael0202 authored Aug 28, 2023
1 parent c8d6e44 commit 4cf23de
Showing 9 changed files with 44 additions and 4 deletions.
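
The core of the change is in Display.pm: the User-Agent string is matched against two regexes, one for known crawlers in general and one for the subset of crawlers that is denied on the whole domain. Below is a minimal sketch of that classification, with abbreviated bot lists; it is not the actual ProductOpener code (the full regexes are in the diff that follows).

use strict;
use warnings;

# Minimal sketch of the classification added to Display.pm, with abbreviated
# bot lists (the full regexes are in the diff below); not the actual
# ProductOpener code.
sub classify_user_agent {
    my ($user_agent_str) = @_;
    my ($is_crawl_bot, $is_denied_crawl_bot) = (0, 0);

    # Any known crawler is flagged; regular crawlers only get noindex pages on
    # facet and list-of-tags pages.
    if ($user_agent_str =~ /Googlebot|bingbot|AhrefsBot|Bytespider|GPTBot|Scrapy/i) {
        $is_crawl_bot = 1;
        # SEO bots, scrapers and AI crawlers are denied on the whole domain.
        if ($user_agent_str =~ /AhrefsBot|Bytespider|GPTBot|Scrapy/) {
            $is_denied_crawl_bot = 1;
        }
    }
    return ($is_crawl_bot, $is_denied_crawl_bot);
}

# Example: AhrefsBot is both a crawler and a denied crawler.
my ($crawl, $denied)
    = classify_user_agent('Mozilla/5.0 (compatible; AhrefsBot/6.1; +http://ahrefs.com/robot/)');
print "is_crawl_bot=$crawl is_denied_crawl_bot=$denied\n";
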
12 changes: 11 additions & 1 deletion lib/ProductOpener/Display.pm
@@ -926,6 +926,8 @@ Set two attributes to `request_ref`:
- `is_crawl_bot`: a flag (0 or 1) that indicates whether the request comes
from a known web crawler (Google, Bing,...). We only use the User-Agent value
to set this flag.
- `is_denied_crawl_bot`: a flag (0 or 1) that indicates whether the request
comes from a web crawler we want to deny access to.
=cut

@@ -934,13 +936,21 @@ sub set_user_agent_request_ref_attributes ($request_ref) {
$request_ref->{user_agent} = $user_agent_str;

my $is_crawl_bot = 0;
my $is_denied_crawl_bot = 0;
if ($user_agent_str
=~ /Googlebot|Googlebot-Image|Google-InspectionTool|bingbot|Applebot|YandexBot|YandexRenderResourcesBot|DuckDuckBot|DotBot|SeekportBot|AhrefsBot|DataForSeoBot|SeznamBot|ZoomBot|MojeekBot|QRbot|www\.qwant\.com|facebookexternalhit/i
=~ /Googlebot|Googlebot-Image|Google-InspectionTool|bingbot|Applebot|YandexBot|YandexRenderResourcesBot|DuckDuckBot|DotBot|SeekportBot|AhrefsBot|DataForSeoBot|SeznamBot|ZoomBot|MojeekBot|QRbot|www\.qwant\.com|facebookexternalhit|Bytespider|GPTBot|SEOkicks-Robot|SearchmetricsBot|MJ12bot|SurveyBot|SEOdiver|wotbox|Cliqzbot|Paracrawl|Scrapy|VelenPublicWebCrawler|SemrushBot|MegaIndex\.ru|YandexMarket|Amazonbot|aiohttp|python-request/i
)
{
$is_crawl_bot = 1;
if ($user_agent_str
=~ /bingbot|SeekportBot|AhrefsBot|DataForSeoBot|SeznamBot|ZoomBot|MojeekBot|QRbot|Bytespider|SEOkicks-Robot|SearchmetricsBot|MJ12bot|SurveyBot|SEOdiver|wotbox|Cliqzbot|Paracrawl|Scrapy|VelenPublicWebCrawler|SemrushBot|MegaIndex\.ru|YandexMarket|Amazonbot/
)
{
$is_denied_crawl_bot = 1;
}
}
$request_ref->{is_crawl_bot} = $is_crawl_bot;
$request_ref->{is_denied_crawl_bot} = $is_denied_crawl_bot;
return;
}

4 changes: 3 additions & 1 deletion lib/ProductOpener/Routing.pm
@@ -596,8 +596,10 @@ Return 1 if the page should not be indexed by web crawlers based on analyzed req
sub is_no_index_page ($request_ref) {
return scalar(
($request_ref->{is_crawl_bot} == 1) and (
# if is_denied_crawl_bot == 1, we don't accept any request from this bot
($request_ref->{is_denied_crawl_bot} == 1)
# All list of tags pages should be non-indexable
(defined $request_ref->{groupby_tagtype})
or (defined $request_ref->{groupby_tagtype})
or (
(
defined $request_ref->{tagtype} and (
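
In Routing.pm, the denied flag is now checked first, so a denied crawler is served a noindex page on every URL, while regular crawlers are only blocked on list-of-tags pages and certain facets. A rough sketch of the resulting logic (the facet-specific conditions at the end of the real function are collapsed in the diff above, so they are only summarized here):

use strict;
use warnings;

# Rough sketch of the resulting is_no_index_page() logic; not the actual
# ProductOpener code.
sub is_no_index_page_sketch {
    my ($request_ref) = @_;
    return 0 unless $request_ref->{is_crawl_bot};
    # Denied crawlers never get an indexable page, anywhere on the domain.
    return 1 if $request_ref->{is_denied_crawl_bot};
    # Regular crawlers are only served noindex pages on list-of-tags pages
    # (and on some facet pages, omitted here).
    return 1 if defined $request_ref->{groupby_tagtype};
    return 0;
}

print is_no_index_page_sketch({is_crawl_bot => 1, is_denied_crawl_bot => 1}), "\n";    # 1
print is_no_index_page_sketch({is_crawl_bot => 1, is_denied_crawl_bot => 0}), "\n";    # 0
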
3 changes: 3 additions & 0 deletions templates/web/pages/robots/robots.tt.txt
@@ -87,4 +87,7 @@ Disallow: /

User-agent: DataForSeoBot
Disallow: /

User-agent: AhrefsBot
Disallow: /
[% END %]
@@ -209,3 +209,6 @@ Disallow: /

User-agent: DataForSeoBot
Disallow: /

User-agent: AhrefsBot
Disallow: /
@@ -298,3 +298,6 @@ Disallow: /

User-agent: DataForSeoBot
Disallow: /

User-agent: AhrefsBot
Disallow: /
@@ -298,3 +298,6 @@ Disallow: /

User-agent: DataForSeoBot
Disallow: /

User-agent: AhrefsBot
Disallow: /
@@ -209,3 +209,6 @@ Disallow: /

User-agent: DataForSeoBot
Disallow: /

User-agent: AhrefsBot
Disallow: /
@@ -209,3 +209,6 @@ Disallow: /

User-agent: DataForSeoBot
Disallow: /

User-agent: AhrefsBot
Disallow: /
14 changes: 12 additions & 2 deletions tests/integration/page_crawler.t
@@ -19,8 +19,8 @@ wait_application_ready();

my $ua = new_client();

my $CRAWLING_BOT_USER_AGENT
= 'Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm) Chrome/';
my $CRAWLING_BOT_USER_AGENT = 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)';
my $DENIED_CRAWLING_BOT_USER_AGENT = 'Mozilla/5.0 (compatible; AhrefsBot/6.1; +http://ahrefs.com/robot/)';
my $NORMAL_USER_USER_AGENT
= 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/114.0Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/114.0';

@@ -56,6 +56,16 @@ my $tests_ref = [
expected_type => 'html',
response_content_must_match => '<title>Only-Product - 100 g</title>'
},
# Denied crawling bot should not have access to any page
{
test_case => 'denied-crawler-access-product-page',
method => 'GET',
path => '/product/0200000000235/only-product',
headers_in => {'User-Agent' => $DENIED_CRAWLING_BOT_USER_AGENT},
expected_status_code => 200,
expected_type => 'html',
response_content_must_match => '<h1>NOINDEX</h1>'
},
# Crawling bot should receive a noindex page for nested facets
{
test_case => 'crawler-access-nested-facet-page',
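
The new test case asserts that a denied crawler still receives an HTTP 200 response, but with a NOINDEX placeholder page instead of the product content. A hypothetical manual check against a local dev instance could look like the following; the host name is an assumption to adjust to your setup, and the product path is the one used in the test above.

use strict;
use warnings;
use LWP::UserAgent;

# Hypothetical manual check against a local dev instance; the host name below
# is an assumption, the barcode/path comes from the integration test above.
my $ua = LWP::UserAgent->new(
    agent => 'Mozilla/5.0 (compatible; AhrefsBot/6.1; +http://ahrefs.com/robot/)');
my $response = $ua->get('http://world.openfoodfacts.localhost/product/0200000000235/only-product');
print 'status: ', $response->code(), "\n";
print "denied crawler was served the NOINDEX placeholder page\n"
    if $response->decoded_content() =~ m{<h1>NOINDEX</h1>};
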
