forked from ai-robots-txt/ai.robots.txt
-
Notifications
You must be signed in to change notification settings - Fork 0
/
robots.json
191 lines (191 loc) · 10.1 KB
/
robots.json
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
{
"Amazonbot": {
"operator": "Amazon",
"respect": "Yes",
"function": "Service improvement and enabling answers for Alexa users.",
"frequency": "No information provided.",
"description": "Includes references to crawled website when surfacing answers via Alexa; does not clearly outline other uses."
},
"anthropic-ai": {
"operator": "[Anthropic](https:\/\/www.anthropic.com)",
"respect": "Unclear at this time.",
"function": "Scrapes data to train Anthropic's AI products.",
"frequency": "No information provided.",
"description": "Scrapes data to train LLMs and AI products offered by Anthropic."
},
"Applebot-Extended": {
"operator": "[Apple](https:\/\/support.apple.com\/en-us\/119829#datausage)",
"respect": "Yes",
"function": "Powers features in Siri, Spotlight, Safari, Apple Intelligence, and others.",
"frequency": "Unclear at this time.",
"description": "Apple has a secondary user agent, Applebot-Extended ... [that is] used to train Apple's foundation models powering generative AI features across Apple products, including Apple Intelligence, Services, and Developer Tools."
},
"Bytespider": {
"operator": "ByteDance",
"respect": "No",
"function": "LLM training.",
"frequency": "Unclear at this time.",
"description": "Downloads data to train LLMs, including ChatGPT competitors."
},
"CCBot": {
"operator": "[Common Crawl](https:\/\/commoncrawl.org)",
"respect": "[Yes](https:\/\/commoncrawl.org\/ccbot)",
"function": "Provides crawl data for an open source repository that has been used to train LLMs.",
"frequency": "Unclear at this time.",
"description": "Sources data that is made openly available and is used to train AI models."
},
"ChatGPT-User": {
"operator": "[OpenAI](https:\/\/openai.com)",
"respect": "Yes",
"function": "Takes action based on user prompts.",
"frequency": "Only when prompted by a user.",
"description": "Used by plugins in ChatGPT to answer queries based on user input."
},
"ClaudeBot": {
"operator": "[Anthropic](https:\/\/www.anthropic.com)",
"respect": "Unclear at this time.",
"function": "Scrapes data to train Anthropic's AI products.",
"frequency": "No information provided.",
"description": "Scrapes data to train LLMs and AI products offered by Anthropic."
},
"Claude-Web": {
"operator": "[Anthropic](https:\/\/www.anthropic.com)",
"respect": "Unclear at this time.",
"function": "Scrapes data to train Anthropic's AI products.",
"frequency": "No information provided.",
"description": "Scrapes data to train LLMs and AI products offered by Anthropic."
},
"cohere-ai": {
"operator": "[Cohere](https:\/\/cohere.com)",
"respect": "Unclear at this time.",
"function": "Retrieves data to provide responses to user-initiated prompts.",
"frequency": "Takes action based on user prompts.",
"description": "Retrieves data based on user prompts."
},
"Diffbot": {
"operator": "[Diffbot](https:\/\/www.diffbot.com\/)",
"respect": "At the discretion of Diffbot users.",
"function": "Aggregates structured web data for monitoring and AI model training.",
"frequency": "Unclear at this time.",
"description": "Diffbot is an application used to parse web pages into structured data; this data is used for monitoring or AI model training."
},
"FacebookBot": {
"operator": "Meta\/Facebook",
"respect": "[Yes](https:\/\/developers.facebook.com\/docs\/sharing\/bot\/)",
"function": "Training language models",
"frequency": "Up to 1 page per second",
"description": "Officially used for training Meta \"speech recognition technology,\" unknown if used to train Meta AI specifically."
},
"Google-Extended": {
"operator": "Google",
"respect": "[Yes](https:\/\/developers.google.com\/search\/docs\/crawling-indexing\/overview-google-crawlers)",
"function": "LLM training.",
"frequency": "No information.",
"description": "Used to train Gemini and Vertex AI generative APIs. Does not impact a site's inclusion or ranking in Google Search."
},
"GoogleOther": {
"operator": "Google",
"respect": "[Yes](https:\/\/developers.google.com\/search\/docs\/crawling-indexing\/overview-google-crawlers)",
"function": "Scrapes data.",
"frequency": "No information.",
"description": "\"Used by various product teams for fetching publicly accessible content from sites. For example, it may be used for one-off crawls for internal research and development.\""
},
"GoogleOther-Image": {
"operator": "Google",
"respect": "[Yes](https:\/\/developers.google.com\/search\/docs\/crawling-indexing\/overview-google-crawlers)",
"function": "Scrapes data.",
"frequency": "No information.",
"description": "\"Used by various product teams for fetching publicly accessible content from sites. For example, it may be used for one-off crawls for internal research and development.\""
},
"GoogleOther-Video": {
"operator": "Google",
"respect": "[Yes](https:\/\/developers.google.com\/search\/docs\/crawling-indexing\/overview-google-crawlers)",
"function": "Scrapes data.",
"frequency": "No information.",
"description": "\"Used by various product teams for fetching publicly accessible content from sites. For example, it may be used for one-off crawls for internal research and development.\""
},
"GPTBot": {
"operator": "[OpenAI](https:\/\/openai.com)",
"respect": "Yes",
"function": "Scrapes data to train OpenAI's products.",
"frequency": "No information.",
"description": "Data is used to train current and future models, with paywalled data, PII, and data that violates the company's policies removed."
},
"img2dataset": {
"operator": "[img2dataset](https:\/\/github.com\/rom1504\/img2dataset)",
"respect": "Unclear at this time.",
"function": "Scrapes images for use in LLMs.",
"frequency": "At the discretion of img2dataset users.",
"description": "Downloads large sets of images into datasets for LLM training or other purposes."
},
"Meta-ExternalAgent": {
"operator": "[Meta](https:\/\/developers.facebook.com\/docs\/sharing\/webmasters\/web-crawlers)",
"respect": "Yes",
"function": "Used to train models and improve products.",
"frequency": "No information.",
"description": "\"The Meta-ExternalAgent crawler crawls the web for use cases such as training AI models or improving products by indexing content directly.\""
},
"OAI-SearchBot": {
"operator": "[OpenAI](https:\/\/openai.com)",
"respect": "[Yes](https:\/\/platform.openai.com\/docs\/bots)",
"function": "Search result generation.",
"frequency": "No information.",
"description": "Crawls sites to surface as results in SearchGPT."
},
"omgili": {
"operator": "[Webz.io](https:\/\/webz.io\/)",
"respect": "[Yes](https:\/\/webz.io\/blog\/web-data\/what-is-the-omgili-bot-and-why-is-it-crawling-your-website\/)",
"function": "Data is sold.",
"frequency": "No information.",
"description": "Crawls sites for APIs used by Hootsuite, Sprinklr, NetBase, and other companies. Data also sold for research purposes or LLM training."
},
"omgilibot": {
"operator": "[Webz.io](https:\/\/webz.io\/)",
"respect": "[Yes](https:\/\/web.archive.org\/web\/20170704003301\/http:\/\/omgili.com\/Crawler.html)",
"function": "Data is sold.",
"frequency": "No information.",
"description": "Legacy user agent initially used for Omgili search engine. Unknown if still used; the `omgili` agent is still used by Webz.io."
},
"PerplexityBot": {
"operator": "[Perplexity](https:\/\/www.perplexity.ai\/)",
"respect": "[No](https:\/\/www.macstories.net\/stories\/wired-confirms-perplexity-is-bypassing-efforts-by-websites-to-block-its-web-crawler\/)",
"function": "Used to answer queries at the request of users.",
"frequency": "Takes action based on user prompts.",
"description": "Operated by Perplexity to obtain results in response to user queries."
},
"Scrapy": {
"operator": "[Zyte](https:\/\/www.zyte.com)",
"respect": "Unclear at this time.",
"function": "Scrapes data for a variety of uses including training AI.",
"frequency": "No information.",
"description": "\"AI and machine learning applications often need large amounts of quality data, and web data extraction is a fast, efficient way to build structured data sets.\""
},
"Timpibot": {
"operator": "[Timpi](https:\/\/timpi.io)",
"respect": "Unclear at this time.",
"function": "Scrapes data for use in training LLMs.",
"frequency": "No information.",
"description": "Makes data available for training AI models."
},
"VelenPublicWebCrawler": {
"operator": "[Velen Crawler](https:\/\/velen.io)",
"respect": "[Yes](https:\/\/velen.io)",
"function": "Scrapes data for business data sets and machine learning models.",
"frequency": "No information.",
"description": "\"Our goal with this crawler is to build business datasets and machine learning models to better understand the web.\""
},
"YouBot": {
"operator": "[You](https:\/\/about.you.com\/youchat\/)",
"respect": "[Yes](https:\/\/about.you.com\/youbot\/)",
"function": "Scrapes data for search engine and LLMs.",
"frequency": "No information.",
"description": "Retrieves data used for You.com web search engine and LLMs."
},
"TestBot2": {
"operator": "Testing operator",
"respect": "Testing respect",
"function": "Testing function",
"frequency": "Testing frequency",
"description": "Testing description"
}
}