feat: add dynamic robots.txt #1395
Changes from all commits: ad313b9, ff16126, 1345c34, 4690872, 2ca1a4c, ed5dfe1, cffc389
@@ -0,0 +1,3 @@

```js
import { getRobotsTxtContent } from "@/trustlab/lib/data";

export default getRobotsTxtContent;
```
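For orientation (the file path is not shown in this diff), if this module is an App Router `app/robots.js` metadata file, Next.js calls the default export and serializes its return value into the served `/robots.txt`. A minimal standalone sketch of the expected return shape, with example values that are not from the PR:

```js
// Hypothetical app/robots.js equivalent, for illustration only; values are invented.
// Next.js renders the returned object as the robots.txt response.
export default async function robots() {
  return {
    rules: { userAgent: "*", disallow: "/" }, // -> "User-agent: *\nDisallow: /"
    sitemap: "https://example.com/sitemap.xml", // optional; example URL
  };
}
```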
```diff
@@ -1,3 +1,8 @@
-export { getPageStaticPaths, getPageStaticProps } from "./local";
+export {
+  getPageStaticPaths,
+  getPageStaticProps,
+  getServerSideProps,
+  getRobotsTxtContent,
+} from "./local";
 
 export default undefined;
```
```diff
@@ -1,4 +1,5 @@
 import { getPageProps, getPagePaths } from "@/trustlab/lib/data/common";
+import { parseRobotsTxt } from "@/trustlab/lib/data/common/seo";
 import api from "@/trustlab/lib/payload";
 
 export async function getPageStaticPaths() {
@@ -15,3 +16,20 @@ export async function getPageStaticProps(context) {
     revalidate: 60,
   };
 }
+
+export async function getServerSideProps(context) {
+  const props = await getPageProps(api, context);
+  if (!props) {
+    return { notFound: true };
+  }
+  return {
+    props,
+  };
+}
+
+export async function getRobotsTxtContent() {
```
**Member:** Since we're parsing content to
```diff
+  const siteSettings = await api.findGlobal("site-settings");
+  return parseRobotsTxt(siteSettings?.robotsTxt);
+}
+
+export default undefined;
```
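To make the data flow concrete, here is a sketch of what `getRobotsTxtContent` would return for a hypothetical `site-settings` global, assuming `parseRobotsTxt` re-exports the `parseRobotsToMetadata` utility added later in this PR (the stored text and the resulting object below are illustrative):

```js
// Illustrative only: a site-settings global whose robotsTxt field holds raw robots.txt text.
const siteSettings = {
  robotsTxt: "User-agent: *\nDisallow: /admin\nSitemap: https://example.com/sitemap.xml",
};

// parseRobotsTxt(siteSettings.robotsTxt) would then yield roughly:
// {
//   rules: [{ userAgent: "*", disallow: "/admin" }],
//   sitemap: ["https://example.com/sitemap.xml"],
//   host: null,
//   cleanParams: [],
//   errors: undefined,
// }
```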
```diff
@@ -7,6 +7,26 @@ import {
   /* eslint-disable-next-line import/no-unresolved */
 } from "@payloadcms/plugin-seo/fields";
 
+import parseRobotsToMetadata from "@/trustlab/utils/parseRobotsTxt";
+
+const validateRobotsTxt = (value) => {
+  if (!value?.trim()) {
+    return true;
+  }
+  const result = parseRobotsToMetadata(value, { collectDiagnostics: true });
+  if (!result.errors?.length) {
+    return true;
+  }
+  const message = result.errors
+    .map(({ line, directive, reason }) =>
+      [`line ${line}`, directive ? `directive "${directive}"` : null, reason]
+        .filter(Boolean)
+        .join(" "),
+    )
+    .join("; ");
+  return `Invalid robots.txt: ${message}`;
+};
+
 const SeoTab = {
   label: "SEO",
   fields: [
@@ -46,6 +66,18 @@ const SeoTab = {
         }),
       ],
     },
+    {
+      name: "robotsTxt",
+      label: "robots.txt content",
```
```diff
+      type: "code",
+      defaultValue: "User-agent: *\nDisallow: /",
+      admin: {
+        language: "plaintext",
+        rows: 14,
+        description: "Paste the exact robots.txt text to serve.",
```
**Member:** Why paste?
```diff
+      },
+      validate: validateRobotsTxt,
+    },
   ],
 };
```
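As a quick illustration of how the validator behaves (inputs invented for this sketch):

```js
validateRobotsTxt("");                           // => true (empty values are accepted)
validateRobotsTxt("User-agent: *\nDisallow: /"); // => true (well-formed content passes)

// Unknown directives surface as a readable error string for the admin UI:
validateRobotsTxt("Foobar: nope");
// => 'Invalid robots.txt: line 1 directive "Foobar" Unknown directive'
```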
**Member:** Again, from the bots:

Looking at your code and the Next.js robots object interface, there's a mismatch. The Next.js type is:

```ts
type Robots = {
  rules: Rule | Rule[]
  sitemap?: string | string[]
  host?: string
}

type Rule = {
  userAgent?: string | string[]
  allow?: string | string[]
  disallow?: string | string[]
  crawlDelay?: number
}
```

Your parser is fairly close but has some unnecessary complexity for this use case. A few observations:

Here's a leaner version focused specifically on producing a Next.js-compatible robots object. Key improvements over your original:

```js
/**
 * Parses a robots.txt string into a Next.js-compatible robots metadata object.
 * @see https://nextjs.org/docs/app/api-reference/file-conventions/metadata/robots#robots-object
 *
 * @param {string} raw - robots.txt content
 * @returns {{ rules: Rule | Rule[], sitemap?: string | string[], host?: string }}
 */
export default function parseRobotsToNextJs(raw = "") {
  const lines = String(raw).replace(/\r\n?/g, "\n").split("\n");
  const groups = []; // { userAgent: string[], allow: string[], disallow: string[], crawlDelay?: number }
  const sitemaps = [];
  let host = null;
  let cur = null;
  let curHasDirectives = false;

  const ensureGroup = () => {
    if (!cur) {
      cur = { userAgent: ["*"], allow: [], disallow: [] };
      groups.push(cur);
      curHasDirectives = false;
    }
    return cur;
  };

  for (const rawLine of lines) {
    const line = rawLine.replace(/#.*$/, "").trim();
    if (!line) continue;
    const sep = line.indexOf(":");
    if (sep === -1) continue;
    const key = line.slice(0, sep).trim().toLowerCase();
    const val = line.slice(sep + 1).trim();

    switch (key) {
      case "sitemap":
        if (val) sitemaps.push(val);
        break;
      case "host":
        if (val) host = val;
        break;
      case "user-agent":
        if (!cur || curHasDirectives) {
          cur = { userAgent: [val || "*"], allow: [], disallow: [] };
          groups.push(cur);
          curHasDirectives = false;
        } else {
          cur.userAgent.push(val || "*");
        }
        break;
      case "allow":
        ensureGroup().allow.push(val);
        curHasDirectives = true;
        break;
      case "disallow":
        ensureGroup().disallow.push(val);
        curHasDirectives = true;
        break;
      case "crawl-delay": {
        const n = Number(val);
        if (Number.isFinite(n)) ensureGroup().crawlDelay = n;
        curHasDirectives = true;
        break;
      }
      // Ignore unknown directives silently
    }
  }

  // Collapse single-element arrays to plain values for Next.js compat
  const unwrap = (arr) => (arr.length === 1 ? arr[0] : arr.length ? arr : undefined);

  const rules = groups.map((g) => {
    const rule = { userAgent: unwrap(g.userAgent) };
    const allow = unwrap(g.allow);
    const disallow = unwrap(g.disallow);
    if (allow !== undefined) rule.allow = allow;
    if (disallow !== undefined) rule.disallow = disallow;
    if (g.crawlDelay !== undefined) rule.crawlDelay = g.crawlDelay;
    return rule;
  });

  const result = { rules: rules.length === 1 ? rules[0] : rules };
  const sm = unwrap(sitemaps);
  if (sm !== undefined) result.sitemap = sm;
  if (host) result.host = host;
  return result;
}
```

If you also need diagnostics for debugging malformed files, you could add that back as an opt-in, but for a "parse robots.txt → Next.js metadata" utility, this is all you need.
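To illustrate the shape difference being flagged, a hypothetical call to the suggested parser (input invented for this sketch):

```js
parseRobotsToNextJs(
  "User-agent: *\nAllow: /public\nDisallow: /\nSitemap: https://example.com/sitemap.xml",
);
// => {
//   rules: { userAgent: "*", allow: "/public", disallow: "/" },
//   sitemap: "https://example.com/sitemap.xml",
// }
// Single-entry groups collapse to plain values, matching the Next.js Robots type.
```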
@@ -0,0 +1,203 @@

```js
const USER_AGENT_REGEX = /^user-agent$/i;
const ALLOW_REGEX = /^allow$/i;
const DISALLOW_REGEX = /^disallow$/i;
const CRAWL_DELAY_REGEX = /^crawl-delay$/i;
const CACHE_DELAY_REGEX = /^cache-delay$/i;
const VISIT_TIME_REGEX = /^visit-time$/i;
const SITEMAP_REGEX = /^sitemap$/i;
const HOST_REGEX = /^host$/i;
const CLEAN_PARAM_REGEX = /^clean-param$/i;

const KNOWN_DIRECTIVES = new Set([
  "user-agent",
  "allow",
  "disallow",
  "crawl-delay",
  "cache-delay",
  "visit-time",
  "sitemap",
  "host",
  "clean-param",
]);

const normalizeString = (value) =>
  typeof value === "string" ? value : `${value ?? ""}`;

const appendDirectiveValue = (previous, value) => {
  if (previous === undefined || previous === null || previous === "") {
    return value;
  }
  if (Array.isArray(previous)) {
    return [...previous, value];
  }
  return [previous, value];
};

const appendUserAgent = (previous, value) => {
  const normalized = value || "*";
  if (!previous || previous === "") {
    return normalized;
  }
  if (Array.isArray(previous)) {
    return [...previous, normalized];
  }
  if (previous === normalized) {
    return previous;
  }
  return [previous, normalized];
};

const startRule = (rules, userAgent, meta = {}) => {
  const rule = { userAgent: userAgent || "*", ...meta };
  rules.push(rule);
  return rule;
};

const updateRule = (rule, next) => Object.assign(rule, next);

export default function parseRobotsToMetadata(rawContent = "", options = {}) {
  const { collectDiagnostics = false } = options;
  const diagnostics = collectDiagnostics ? [] : undefined;
  const content = normalizeString(rawContent);
  const rules = [];
  const sitemap = [];
  const cleanParams = [];
  let host = null;
  let current = null;
  let currentHasDirectives = false;

  const ensureRule = () => {
    if (!current) {
      current = startRule(rules, "*", { autoUserAgent: true });
      currentHasDirectives = false;
    }
    return current;
  };

  content
    .replace(/\r\n?/g, "\n")
    .split("\n")
    .forEach((rawLine, index) => {
      const line = rawLine.trim();
      if (!line || line.startsWith("#")) {
        return;
      }

      const separatorIndex = line.indexOf(":");
      if (separatorIndex === -1) {
        if (diagnostics) {
          diagnostics.push({
            line: index + 1,
            directive: line,
            reason: 'Missing ":" separator',
          });
        }
        return;
      }

      const key = line.slice(0, separatorIndex).trim();
      const keyLower = key.toLowerCase();
      const value = line.slice(separatorIndex + 1).trim();

      if (!KNOWN_DIRECTIVES.has(keyLower)) {
        if (diagnostics) {
          diagnostics.push({
            line: index + 1,
            directive: key,
            reason: "Unknown directive",
          });
        }
        return;
      }

      if (SITEMAP_REGEX.test(key)) {
        if (value) {
          sitemap.push(value);
        }
        return;
      }

      if (HOST_REGEX.test(key)) {
        if (value) {
          host = value;
        }
        return;
      }

      if (CLEAN_PARAM_REGEX.test(key)) {
        if (value) {
          cleanParams.push(value);
        }
        return;
      }

      if (USER_AGENT_REGEX.test(key)) {
        if (current && current.autoUserAgent) {
          updateRule(current, { userAgent: value || "*" });
          delete current.autoUserAgent;
        } else if (!current || currentHasDirectives) {
          current = startRule(rules, value || "*");
        } else {
          updateRule(current, {
            userAgent: appendUserAgent(current.userAgent, value || "*"),
          });
        }
        currentHasDirectives = false;
        return;
      }

      const rule = ensureRule();

      if (ALLOW_REGEX.test(key)) {
        updateRule(rule, { allow: appendDirectiveValue(rule.allow, value) });
        currentHasDirectives = true;
        return;
      }

      if (DISALLOW_REGEX.test(key)) {
        updateRule(rule, {
          disallow: appendDirectiveValue(rule.disallow, value),
        });
        currentHasDirectives = true;
        return;
      }

      if (CRAWL_DELAY_REGEX.test(key)) {
        const numeric = Number(value);
        updateRule(rule, {
          crawlDelay: Number.isFinite(numeric) ? numeric : value,
        });
        currentHasDirectives = true;
        return;
      }

      if (CACHE_DELAY_REGEX.test(key)) {
        const numeric = Number(value);
        updateRule(rule, {
          cacheDelay: Number.isFinite(numeric) ? numeric : value,
        });
        currentHasDirectives = true;
        return;
      }

      if (VISIT_TIME_REGEX.test(key)) {
        updateRule(rule, {
          visitTime: appendDirectiveValue(rule.visitTime, value),
        });
        currentHasDirectives = true;
      }
    });

  const sanitizedRules = rules.map((ruleEntry) => {
    const { autoUserAgent, ...cleanRule } = ruleEntry;
    return cleanRule;
  });

  return {
    rules: sanitizedRules,
    sitemap,
    host,
    cleanParams,
    errors: diagnostics,
  };
}
```
Is this being used anywhere?