-
-
Notifications
You must be signed in to change notification settings - Fork 1
/
robots.txt
100 lines (71 loc) · 2 KB
/
robots.txt
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
#########################################################
# Disallow crawlers and scrapers for training AI/ML/LLM #
#########################################################
# Amazonbot (Alexa)
# Amazon's crawler; its stated purpose is to "improve our services"
# https://developer.amazon.com/amazonbot
# The rule below denies it every path on this site.
User-agent: Amazonbot
Disallow: /
# Anthropic (Claude)
# https://www.anthropic.com/
# ClaudeBot is the crawler token Anthropic documents for robots.txt opt-out.
# anthropic-ai is kept so the token that was already listed stays blocked.
# Both tokens form one group and share the single rule below.
User-agent: anthropic-ai
User-agent: ClaudeBot
Disallow: /
# Applebot (Siri)
# Crawler for all Apple products, including Siri and Spotlight Suggestions
# Applebot-Extended is Apple's separate opt-out token for using crawled
# content to train its generative models; it does not crawl by itself.
# Listing it keeps the AI-training opt-out explicit even if the broader
# Applebot rule is ever relaxed. Both tokens share the rule below.
User-agent: Applebot
User-agent: Applebot-Extended
Disallow: /
# Bytedance (Doubao, TikTok)
# Caveat: this crawler does not seem to honor robots.txt
# https://darkvisitors.com/agents/bytespider
# Both tokens are listed in one group and share the rule below.
User-agent: Bytespider
User-agent: Bytedance
Disallow: /
# Common Crawl
# Its crawl data is said to be used by Google and likely others
# https://commoncrawl.org/ccbot
# Blocked from the whole site.
User-agent: CCBot
Disallow: /
# Cohere
# https://cohere.com/
# Blocked from the whole site.
User-agent: cohere-ai
Disallow: /
# FacebookBot (Meta Facebook)
# https://developers.facebook.com/docs/sharing/bot
# Blocked from the whole site.
# NOTE(review): Meta appears to document further crawler tokens
# (e.g. meta-externalagent) - confirm whether they should be listed here too.
User-agent: FacebookBot
Disallow: /
# Google Extended (Gemini)
# Covers Gemini apps, Vertex AI generative APIs, and other Google models
# https://developers.google.com/search/docs/crawling-indexing/overview-google-crawlers#google-extended
# The rule below applies to every path on this site.
User-agent: Google-Extended
Disallow: /
# GPT (OpenAI)
# GPTBot:       https://platform.openai.com/docs/gptbot
# ChatGPT-User: https://platform.openai.com/docs/plugins/bot
# One group covers both OpenAI tokens; they share the rule below.
User-agent: GPTBot
User-agent: ChatGPT-User
Disallow: /
# ImageSift (Hive)
# Scraper that collects data to train generative AI
# https://imagesift.com/about
# Blocked from the whole site.
User-agent: ImagesiftBot
Disallow: /
# Omgili Bot
# Commercial crawler whose operator uses the scraped data
# https://webz.io/blog/web-data/what-is-the-omgili-bot-and-why-is-it-crawling-your-website/
# Both token spellings are listed in one group and share the rule below.
User-agent: omgili
User-agent: omgilibot
Disallow: /
# Semrush
# Commercial crawler; sells marketing and competitor data intended for use with AI
# https://www.semrush.com/bot/
# https://www.semrush.com/apps/contentshake/
# Blocked from the whole site.
User-agent: SemrushBot
Disallow: /
# YouBot (you.com)
# Crawler for the you.com AI chat product
# https://about.you.com/youbot/
# Blocked from the whole site.
User-agent: YouBot
Disallow: /