fix: improve web crawler indexation blocking (#8896)
- totally deny access to some bots (SeekportBot, SEO bots, Scrapy, ...) on the
whole domain by serving them noindex pages
- add more bots to the crawler list (GPTBot, Bytespider, ...)
raphael0202 authored Aug 28, 2023
1 parent c8d6e44 commit 4cf23de
Showing 9 changed files with 44 additions and 4 deletions.
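
The core of the change is in Display.pm: the User-Agent string is matched against two regexes, one for known crawlers in general and one for the subset of crawlers that is denied on the whole domain. Below is a minimal sketch of that classification, with abbreviated bot lists; it is not the actual ProductOpener code (the full regexes are in the diff that follows).

use strict;
use warnings;

# Minimal sketch of the classification added to Display.pm, with abbreviated
# bot lists (the full regexes are in the diff below); not the actual
# ProductOpener code.
sub classify_user_agent {
    my ($user_agent_str) = @_;
    my ($is_crawl_bot, $is_denied_crawl_bot) = (0, 0);

    # Any known crawler is flagged; regular crawlers only get noindex pages on
    # facet and list-of-tags pages.
    if ($user_agent_str =~ /Googlebot|bingbot|AhrefsBot|Bytespider|GPTBot|Scrapy/i) {
        $is_crawl_bot = 1;
        # SEO bots, scrapers and AI crawlers are denied on the whole domain.
        if ($user_agent_str =~ /AhrefsBot|Bytespider|GPTBot|Scrapy/) {
            $is_denied_crawl_bot = 1;
        }
    }
    return ($is_crawl_bot, $is_denied_crawl_bot);
}

# Example: AhrefsBot is both a crawler and a denied crawler.
my ($crawl, $denied)
    = classify_user_agent('Mozilla/5.0 (compatible; AhrefsBot/6.1; +http://ahrefs.com/robot/)');
print "is_crawl_bot=$crawl is_denied_crawl_bot=$denied\n";
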
12 changes: 11 additions & 1 deletion lib/ProductOpener/Display.pm
@@ -926,6 +926,8 @@ Set two attributes to `request_ref`:
- `is_crawl_bot`: a flag (0 or 1) that indicates whether the request comes
from a known web crawler (Google, Bing,...). We only use the User-Agent value
to set this flag.
- `is_denied_crawl_bot`: a flag (0 or 1) that indicates whether the request
comes from a web crawler we want to deny access to.
=cut

@@ -934,13 +936,21 @@ sub set_user_agent_request_ref_attributes ($request_ref) {
$request_ref->{user_agent} = $user_agent_str;

my $is_crawl_bot = 0;
my $is_denied_crawl_bot = 0;
if ($user_agent_str
=~ /Googlebot|Googlebot-Image|Google-InspectionTool|bingbot|Applebot|YandexBot|YandexRenderResourcesBot|DuckDuckBot|DotBot|SeekportBot|AhrefsBot|DataForSeoBot|SeznamBot|ZoomBot|MojeekBot|QRbot|www\.qwant\.com|facebookexternalhit/i
=~ /Googlebot|Googlebot-Image|Google-InspectionTool|bingbot|Applebot|YandexBot|YandexRenderResourcesBot|DuckDuckBot|DotBot|SeekportBot|AhrefsBot|DataForSeoBot|SeznamBot|ZoomBot|MojeekBot|QRbot|www\.qwant\.com|facebookexternalhit|Bytespider|GPTBot|SEOkicks-Robot|SearchmetricsBot|MJ12bot|SurveyBot|SEOdiver|wotbox|Cliqzbot|Paracrawl|Scrapy|VelenPublicWebCrawler|SemrushBot|MegaIndex\.ru|YandexMarket|Amazonbot|aiohttp|python-request/i
)
{
$is_crawl_bot = 1;
if ($user_agent_str
=~ /bingbot|SeekportBot|AhrefsBot|DataForSeoBot|SeznamBot|ZoomBot|MojeekBot|QRbot|Bytespider|SEOkicks-Robot|SearchmetricsBot|MJ12bot|SurveyBot|SEOdiver|wotbox|Cliqzbot|Paracrawl|Scrapy|VelenPublicWebCrawler|SemrushBot|MegaIndex\.ru|YandexMarket|Amazonbot/
)
{
$is_denied_crawl_bot = 1;
}
}
$request_ref->{is_crawl_bot} = $is_crawl_bot;
$request_ref->{is_denied_crawl_bot} = $is_denied_crawl_bot;
return;
}

4 changes: 3 additions & 1 deletion lib/ProductOpener/Routing.pm
@@ -596,8 +596,10 @@ Return 1 if the page should not be indexed by web crawlers based on analyzed req
sub is_no_index_page ($request_ref) {
return scalar(
($request_ref->{is_crawl_bot} == 1) and (
# if is_denied_crawl_bot == 1, we don't accept any request from this bot
($request_ref->{is_denied_crawl_bot} == 1)
# All list of tags pages should be non-indexable
(defined $request_ref->{groupby_tagtype})
or (defined $request_ref->{groupby_tagtype})
or (
(
defined $request_ref->{tagtype} and (
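
In Routing.pm, the denied flag is now checked first, so a denied crawler is served a noindex page on every URL, while regular crawlers are only blocked on list-of-tags pages and certain facets. A rough sketch of the resulting logic (the facet-specific conditions at the end of the real function are collapsed in the diff above, so they are only summarized here):

use strict;
use warnings;

# Rough sketch of the resulting is_no_index_page() logic; not the actual
# ProductOpener code.
sub is_no_index_page_sketch {
    my ($request_ref) = @_;
    return 0 unless $request_ref->{is_crawl_bot};
    # Denied crawlers never get an indexable page, anywhere on the domain.
    return 1 if $request_ref->{is_denied_crawl_bot};
    # Regular crawlers are only served noindex pages on list-of-tags pages
    # (and on some facet pages, omitted here).
    return 1 if defined $request_ref->{groupby_tagtype};
    return 0;
}

print is_no_index_page_sketch({is_crawl_bot => 1, is_denied_crawl_bot => 1}), "\n";    # 1
print is_no_index_page_sketch({is_crawl_bot => 1, is_denied_crawl_bot => 0}), "\n";    # 0
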
3 changes: 3 additions & 0 deletions templates/web/pages/robots/robots.tt.txt
@@ -87,4 +87,7 @@ Disallow: /

User-agent: DataForSeoBot
Disallow: /

User-agent: AhrefsBot
Disallow: /
[% END %]
@@ -209,3 +209,6 @@ Disallow: /

User-agent: DataForSeoBot
Disallow: /

User-agent: AhrefsBot
Disallow: /
@@ -298,3 +298,6 @@ Disallow: /

User-agent: DataForSeoBot
Disallow: /

User-agent: AhrefsBot
Disallow: /
@@ -298,3 +298,6 @@ Disallow: /

User-agent: DataForSeoBot
Disallow: /

User-agent: AhrefsBot
Disallow: /
@@ -209,3 +209,6 @@ Disallow: /

User-agent: DataForSeoBot
Disallow: /

User-agent: AhrefsBot
Disallow: /
@@ -209,3 +209,6 @@ Disallow: /

User-agent: DataForSeoBot
Disallow: /

User-agent: AhrefsBot
Disallow: /
14 changes: 12 additions & 2 deletions tests/integration/page_crawler.t
@@ -19,8 +19,8 @@ wait_application_ready();

my $ua = new_client();

my $CRAWLING_BOT_USER_AGENT
= 'Mozilla/5.0 AppleWebKit/537.36 (KHTML, like Gecko; compatible; bingbot/2.0; +http://www.bing.com/bingbot.htm) Chrome/';
my $CRAWLING_BOT_USER_AGENT = 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)';
my $DENIED_CRAWLING_BOT_USER_AGENT = 'Mozilla/5.0 (compatible; AhrefsBot/6.1; +http://ahrefs.com/robot/)';
my $NORMAL_USER_USER_AGENT
= 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/114.0Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:109.0) Gecko/20100101 Firefox/114.0';

@@ -56,6 +56,16 @@ my $tests_ref = [
expected_type => 'html',
response_content_must_match => '<title>Only-Product - 100 g</title>'
},
# Denied crawling bot should not have access to any page
{
test_case => 'denied-crawler-access-product-page',
method => 'GET',
path => '/product/0200000000235/only-product',
headers_in => {'User-Agent' => $DENIED_CRAWLING_BOT_USER_AGENT},
expected_status_code => 200,
expected_type => 'html',
response_content_must_match => '<h1>NOINDEX</h1>'
},
# Crawling bot should receive a noindex page for nested facets
{
test_case => 'crawler-access-nested-facet-page',
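
The new test case asserts that a denied crawler still receives an HTTP 200 response, but with a NOINDEX placeholder page instead of the product content. A hypothetical manual check against a local dev instance could look like the following; the host name is an assumption to adjust to your setup, and the product path is the one used in the test above.

use strict;
use warnings;
use LWP::UserAgent;

# Hypothetical manual check against a local dev instance; the host name below
# is an assumption, the barcode/path comes from the integration test above.
my $ua = LWP::UserAgent->new(
    agent => 'Mozilla/5.0 (compatible; AhrefsBot/6.1; +http://ahrefs.com/robot/)');
my $response = $ua->get('http://world.openfoodfacts.localhost/product/0200000000235/only-product');
print 'status: ', $response->code(), "\n";
print "denied crawler was served the NOINDEX placeholder page\n"
    if $response->decoded_content() =~ m{<h1>NOINDEX</h1>};
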
