feat: add spam detection (#33)

cernymatej · web-flow · commit 12df9fb1e8c4 · 2024-10-16T10:23:44.000+01:00
diff --git a/.env.example b/.env.example
@@ -1,2 +1,6 @@
+# Access token with permissions for both the repository where the webhook is installed
+# and the target repository that issues marked as spam are transferred to
 NUXT_GITHUB_TOKEN=
 NUXT_WEBHOOK_GITHUB_SECRET_KEY=
+# The node_id of the repository that issues marked as spam are transferred to
+NUXT_GITHUB_TARGET_REPOSITORY_NODE_ID=
diff --git a/nuxt.config.ts b/nuxt.config.ts
@@ -5,6 +5,7 @@ export default defineNuxtConfig({
   runtimeConfig: {
     github: {
       token: process.env.NUXT_GITHUB_TOKEN || '',
+      targetRepositoryNodeId: process.env.NUXT_GITHUB_TARGET_REPOSITORY_NODE_ID || '',
     },
   },
   routeRules: {
diff --git a/package.json b/package.json
@@ -15,7 +15,8 @@
     "@nuxt/eslint": "^0.6.0",
     "@nuxthub/core": "^0.7.26",
     "nuxt": "^3.13.2",
-    "nuxt-webhook-validators": "^0.1.1"
+    "nuxt-webhook-validators": "^0.1.1",
+    "zod": "^3.23.8"
   },
   "devDependencies": {
     "@nuxt/eslint-config": "^0.6.0",
diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml
diff --git a/server/api/webhook.post.ts b/server/api/webhook.post.ts
@@ -1,51 +1,104 @@
+import { z } from 'zod'
+import { isError } from 'h3'
+
 export default defineEventHandler(async (event) => {
+  const runtimeConfig = useRuntimeConfig(event)
+
   const isValidWebhook = await isValidGithubWebhook(event)
 
   if (!import.meta.dev && !isValidWebhook) {
     throw createError({ statusCode: 401, message: 'Unauthorized: webhook is not valid' })
   }
 
   // TODO: implement as a GitHub app
-  const { action, issue, repository /* installation */ } = await readBody(event)
+  const { action, issue, repository /* installation */ } = await readValidatedBody(event, githubWebhookSchema.parse)
   if (action !== 'opened') return null
 
-  const body = (issue.body || '')
-    .replace(/<!--.*?-->/g, ' ')
-    .replace(/https:\/\/stackblitz.com\/github\/nuxt\/starter/g, '')
-
-  if (body.split(' ').length > 200) return null
-
   const ai = hubAI()
 
-  const res = await ai.run('@hf/nousresearch/hermes-2-pro-mistral-7b', {
-    messages: [
-      {
-        role: 'system', content: `You are a kind, helpful open-source maintainer that answers in JSON. Here\`s the json schema you must adhere to:\n<schema>\n${JSON.stringify(responseSchema)}\n</schema>\n`,
-      },
-      { role: 'user', content: `# ${issue.title}\n\n${issue.body}` },
-    ],
-  }) as { response?: string, tool_calls?: { name: string, arguments: unknown }[] }
-  const answer = res.response?.trim() || ''
+  let analyzedIssue: z.infer<typeof analyzedIssueSchema> | null = null
 
+  // Run the AI model and parse the response
   try {
-    const value = JSON.parse(answer) as Response
-
-    const $github = useGitHubAPI(event)
-    const promises: Array<Promise<unknown>> = []
-
-    const labels = []
+    const res = await ai.run('@hf/nousresearch/hermes-2-pro-mistral-7b', {
+      messages: [
+        {
+          role: 'system',
+          content: `You are a kind, helpful open-source maintainer that answers in JSON. If the issue looks like spam (contains gibberish, nonsense, etc.), it is marked as spam. Do not mark issues as spam purely based on non-English content or bad grammar. Do not answer with anything else other than a valid JSON. Here\`s the json schema you must adhere to:\n<schema>\n${JSON.stringify(responseSchema)}\n</schema>\n`,
+        },
+        { role: 'user', content: `# ${issue.title}\n\n${getNormalizedIssueContent(issue.body || '')}` },
+      ],
+    })
+
+    const aiResponse = aiResponseSchema.parse(res)
+    if (!aiResponse.response) {
+      console.error('Missing AI response', res)
+      throw createError({
+        statusCode: 500,
+        message: 'Missing AI response',
+      })
+    }
 
-    if (value.issueType === 'bug' && value.reproductionProvided === false) {
-      labels.push('needs reproduction')
+    try {
+      analyzedIssue = analyzedIssueSchema.parse(JSON.parse(aiResponse.response.trim()))
     }
-    if (value.issueType === 'bug' && value.possibleRegression === true) {
-      labels.push('possible regression')
+    catch (e) {
+      console.error('Invalid AI response', aiResponse.response, e)
+      throw createError({
+        statusCode: 500,
+        message: 'Invalid AI response',
+      })
     }
-    if (value.nitro === true) {
-      labels.push('nitro')
+  }
+  catch (e) {
+    if (isError(e)) {
+      throw e
+    }
+
+    console.error('Unknown AI error', e)
+    throw createError({
+      statusCode: 500,
+      message: 'Unknown AI error',
+    })
+  }
+
+  const $github = useGitHubAPI(event)
+  const promises: Array<Promise<unknown>> = []
+
+  // Update the GitHub issue
+  try {
+    const labels: IssueLabel[] = []
+
+    if (analyzedIssue.issueType === IssueType.Spam) {
+      promises.push($github('graphql', {
+        baseURL: 'https://api.github.com/',
+        method: 'POST',
+        body: {
+          query: `
+            mutation {
+              transferIssue(input: { issueId: "${issue.node_id}", repositoryId: "${runtimeConfig.github.targetRepositoryNodeId}" }) {
+                issue {
+                  number
+                }
+              }
+            }
+          `,
+        },
+      }))
     }
-    if (value.issueType === 'documentation') {
-      labels.push('documentation')
+    else {
+      if (analyzedIssue.issueType === IssueType.Bug && !analyzedIssue.reproductionProvided) {
+        labels.push(IssueLabel.NeedsReproduction)
+      }
+      if (analyzedIssue.issueType === IssueType.Bug && analyzedIssue.possibleRegression) {
+        labels.push(IssueLabel.PossibleRegression)
+      }
+      if (analyzedIssue.nitro) {
+        labels.push(IssueLabel.Nitro)
+      }
+      if (analyzedIssue.issueType === IssueType.Documentation) {
+        labels.push(IssueLabel.Documentation)
+      }
     }
 
     if (labels.length > 0) {
@@ -55,16 +108,18 @@ export default defineEventHandler(async (event) => {
       }))
     }
 
-    if (value.spokenLanguage.toLowerCase() !== 'english') {
+    // Translate non-English issue titles to English
+    if (analyzedIssue.spokenLanguage !== 'en' && analyzedIssue.issueType !== IssueType.Spam) {
       await ai.run('@cf/meta/m2m100-1.2b', {
         text: issue.title,
-        source_lang: value.spokenLanguage.toLowerCase(),
+        source_lang: analyzedIssue.spokenLanguage,
         target_lang: 'english',
       }).then(({ translated_text }) => {
+        if (!translated_text || !translated_text.trim().length) return
         promises.push($github(`repos/${repository.full_name}/issues/${issue.number}`, {
           method: 'PATCH',
           body: {
-            title: translated_text,
+            title: `[${analyzedIssue?.spokenLanguage}:translated] ${translated_text}`,
           },
         }))
       }).catch(console.error)
@@ -74,20 +129,22 @@ export default defineEventHandler(async (event) => {
 
     return null
   }
-  catch (err) {
-    console.log(err)
-    console.error('Could not parse response from OpenAI', answer)
-    throw createError({ message: 'Could not parse.' })
+  catch (e) {
+    console.error('Error updating issue', e)
+    throw createError({
+      statusCode: 500,
+      message: 'Error updating issue',
+    })
   }
 })
 
 const responseSchema = {
   title: 'Issue Categorisation',
   type: 'object',
   properties: {
-    issueType: { type: 'string', enum: ['bug', 'feature', 'documentation'] },
+    issueType: { type: 'string', enum: ['bug', 'feature', 'documentation', 'spam'] },
     reproductionProvided: { type: 'boolean' },
-    spokenLanguage: { type: 'string' },
+    spokenLanguage: { type: 'string', comment: 'The language of the title in ISO 639-1 format. Do not include country codes, only language code.' },
     possibleRegression: {
       type: 'boolean',
       comment: 'If the issue is reported on upgrade to a new version of Nuxt, it is a possible regression.',
@@ -99,6 +156,54 @@ const responseSchema = {
   },
 } as const
 
+// eslint-disable-next-line @typescript-eslint/no-unused-vars
 type Response = {
   [key in keyof typeof responseSchema['properties']]: typeof responseSchema['properties'][key]['type'] extends 'string' ? 'enum' extends keyof typeof responseSchema['properties'][key] ? typeof responseSchema['properties'][key]['enum'] extends Array<infer S> ? S : string : string : typeof responseSchema['properties'][key]['type'] extends 'boolean' ? boolean : unknown
 }
+
+/** Label names the webhook handler collects for an analyzed issue. */
+const IssueLabel = {
+  NeedsReproduction: 'needs reproduction',
+  PossibleRegression: 'possible regression',
+  Nitro: 'nitro',
+  Documentation: 'documentation',
+} as const
+type IssueLabel = typeof IssueLabel[keyof typeof IssueLabel]
+
+/** Issue categories the AI model may assign (mirrors the `issueType` enum of the response schema). */
+const IssueType = {
+  Bug: 'bug',
+  Feature: 'feature',
+  Documentation: 'documentation',
+  Spam: 'spam',
+} as const
+type IssueType = typeof IssueType[keyof typeof IssueType]
+
+/**
+ * Subset of the GitHub issue webhook payload that the handler reads.
+ */
+const githubWebhookSchema = z.object({
+  // Accept any action: the handler itself returns early for everything except "opened".
+  // Restricting the value here would make other issue actions (edited, closed, ...)
+  // fail body validation instead of being ignored.
+  action: z.string(),
+  issue: z.object({
+    title: z.string(),
+    body: z.string().nullable(),
+    number: z.number(),
+    node_id: z.string(),
+  }),
+  repository: z.object({
+    full_name: z.string(),
+  }),
+  // TODO: implement as a GitHub app
+  installation: z.any().optional(),
+})
+
+// Shape of a single tool call inside the model result
+const aiToolCallSchema = z.object({
+  name: z.string(),
+  arguments: z.unknown(),
+})
+
+// Result of the text-generation call: an optional text answer plus optional tool calls
+const aiResponseSchema = z.object({
+  response: z.optional(z.string()),
+  tool_calls: z.optional(z.array(aiToolCallSchema)),
+})
+
+// TODO: generate AI model schema from this?
+/**
+ * Validates and normalizes the JSON object produced by the AI model.
+ *
+ * Only `issueType` is mandatory. The remaining fields use `.nullish()` so that both an explicit
+ * `null` and an omitted key fall back to a default (`false`, or 'en' for the language) instead of
+ * failing the whole analysis.
+ */
+const analyzedIssueSchema = z.object({
+  issueType: z.nativeEnum(IssueType),
+  reproductionProvided: z.boolean().nullish().transform(v => v ?? false),
+  spokenLanguage: z.string().nullish().transform(lang => getNormalizedLanguage(lang)).describe('The language of the title in ISO 639-1 format.'),
+  possibleRegression: z.boolean().nullish().transform(v => v ?? false).describe('If the issue is reported on upgrade to a new version of Nuxt, it is a possible regression.'),
+  nitro: z.boolean().nullish().transform(v => v ?? false).describe('If the issue is reported only in relation to a single deployment provider, it is possibly a Nitro issue.'),
+})
+  .describe('Issue Categorisation')
diff --git a/server/utils/normalization.ts b/server/utils/normalization.ts
@@ -0,0 +1,57 @@
+// Feature issues
+const FEATURE_REQUEST_TITLE = '### Describe the feature'
+
+// Bug report issues
+const BUG_REPORT_REPRODUCTION_TITLE = '### Reproduction'
+const BUG_REPORT_LOGS_TITLE = '### Logs'
+
+// Maximum number of characters of issue content that is kept
+const MAX_CONTENT_LENGTH = 5000
+
+/**
+ * Normalize the issue content by removing HTML comments, the StackBlitz starter link and
+ * diacritics, then keep only the most relevant section:
+ * - feature requests: everything from "### Describe the feature" onwards,
+ * - bug reports: from "### Reproduction" up to a following "### Logs" heading, if present,
+ * - anything else: the content from its beginning.
+ *
+ * The kept section is capped at MAX_CONTENT_LENGTH characters and trimmed.
+ *
+ * @param txt The issue content to normalize.
+ * @returns The normalized and truncated issue content.
+ */
+export function getNormalizedIssueContent(txt: string) {
+  const text = txt
+    // `[\s\S]` rather than `.` so that comments spanning several lines are removed as well
+    .replace(/<!--[\s\S]*?-->/g, ' ')
+    .replace(/https:\/\/stackblitz\.com\/github\/nuxt\/starter/g, '')
+    // Decompose accented characters, then drop the combining marks
+    .normalize('NFD')
+    .replace(/[\u0300-\u036f]/g, '')
+    .trim()
+
+  // Trim feature requests
+  const featureRequestContentStart = text.indexOf(FEATURE_REQUEST_TITLE)
+  if (featureRequestContentStart !== -1) {
+    // `slice` takes an end index, not a length, so the cap has to be relative to the start
+    return text.slice(featureRequestContentStart, featureRequestContentStart + MAX_CONTENT_LENGTH).trim()
+  }
+
+  // Trim bug reports
+  const bugReportContentStart = text.indexOf(BUG_REPORT_REPRODUCTION_TITLE)
+  if (bugReportContentStart !== -1) {
+    const maxEnd = bugReportContentStart + MAX_CONTENT_LENGTH
+    // Exclude logs from the content; only a logs heading after the reproduction heading counts
+    const bugReportLogsStart = text.indexOf(BUG_REPORT_LOGS_TITLE, bugReportContentStart)
+    const end = bugReportLogsStart !== -1 ? Math.min(bugReportLogsStart, maxEnd) : maxEnd
+    return text.slice(bugReportContentStart, end).trim()
+  }
+
+  return text.slice(0, MAX_CONTENT_LENGTH).trim()
+}
+
+/**
+ * Normalize a language code to a lowercase two-letter code (ISO 639-1 format).
+ *
+ * Surrounding whitespace is ignored and a region suffix separated by `-` or `_`
+ * (e.g. `pt-BR`, `zh_CN`) is removed.
+ *
+ * @param lang The language code to normalize.
+ * @returns The normalized language code or 'en' if the language code is missing or not valid.
+ */
+export function getNormalizedLanguage(lang?: string | null) {
+  // Only the part before the first separator is kept; `?? ''` covers a missing value
+  const language = lang?.trim().toLowerCase().split(/[-_]/)[0] ?? ''
+  return /^[a-z]{2}$/.test(language) ? language : 'en'
+}