github · docs-bot · Jun 4, 2024 · Jun 4, 2024
diff --git a/package-lock.json b/package-lock.json
diff --git a/package.json b/package.json
@@ -197,6 +197,7 @@
   "dependencies": {
     "@elastic/elasticsearch": "8.13.1",
     "@github/failbot": "0.8.3",
+    "@horizon-rs/language-guesser": "0.1.1",
     "@octokit/plugin-retry": "6.0.1",
     "@octokit/request-error": "6.1.1",
     "@primer/behaviors": "^1.5.1",
@@ -216,6 +217,7 @@
     "connect-datadog": "0.0.9",
     "connect-timeout": "1.9.0",
     "cookie-parser": "^1.4.6",
+    "cuss": "2.2.0",
     "dayjs": "^1.11.3",
     "dotenv": "^16.4.5",
     "escape-string-regexp": "5.0.0",
@@ -313,6 +315,7 @@
     "commander": "^12.1.0",
     "cross-env": "^7.0.3",
     "csp-parse": "0.0.2",
+    "csv-parse": "5.5.6",
     "eslint": "8.57.0",
     "eslint-config-prettier": "9.1.0",
     "eslint-config-standard": "17.1.0",

diff --git a/src/events/analyze-comment.js b/src/events/analyze-comment.js
@@ -0,0 +1,158 @@
+import { cuss } from 'cuss'
+import { cuss as cussPt } from 'cuss/pt'
+import { cuss as cussFr } from 'cuss/fr'
+import { cuss as cussEs } from 'cuss/es'
+import { Language } from '@horizon-rs/language-guesser'
+
+const language = new Language()
+
+// Exported for the debugging CLI script
+export const SIGNAL_RATINGS = [
+  {
+    reduction: 1.0,
+    name: 'email-only',
+    validator: (comment) => isEmailOnly(comment),
+  },
+  {
+    reduction: 0.2,
+    name: 'contains-email',
+    validator: (comment) => isContainingEmail(comment),
+  },
+  {
+    reduction: 0.1,
+    name: 'url-only',
+    validator: (comment) => isURL(comment),
+  },
+  {
+    reduction: 0.1,
+    name: 'numbers-only',
+    validator: (comment) => isNumbersOnly(comment),
+  },
+  {
+    reduction: 0.1,
+    name: 'all-uppercase',
+    validator: (comment) => isAllUppercase(comment),
+  },
+  {
+    reduction: 0.1,
+    name: 'too-short',
+    validator: (comment) => isTooShort(comment),
+  },
+  {
+    reduction: 0.2,
+    name: 'not-language',
+    validator: (comment, language) => isNotLanguage(comment, language),
+  },
+  {
+    reduction: 0.3,
+    name: 'cuss-words-likely',
+    validator: (comment, language) => isLikelyCussWords(comment, language),
+  },
+  {
+    reduction: 0.1,
+    name: 'cuss-words-maybe',
+    validator: (comment, language) => isMaybeCussWords(comment, language),
+  },
+]
+
+export async function analyzeComment(text, language = 'en') {
+  const signals = []
+  let rating = 1.0
+  for (const { reduction, name, validator } of SIGNAL_RATINGS) {
+    if (validator(text, language)) {
+      signals.push(name)
+      rating -= reduction
+    }
+    if (rating <= 0) break
+  }
+
+  return { signals, rating }
+}
+
+function isEmailOnly(text) {
+  if (text.includes('@') && !/\s/.test(text.trim()) && !text.includes('://')) {
+    const atSigns = text.split('@').length
+    if (atSigns === 2) {
+      return true
+    }
+  }
+}
+
+function isContainingEmail(text) {
+  if (text.includes('@') && !isEmailOnly(text)) {
+    // Don't use splitWords() here because `foo@example.com` will be
+    // split up into ['foo', 'example.com'].
+    return text.split(/\s+/g).some((word) => isEmailOnly(word))
+  }
+  return false
+}
+
+function isURL(text) {
+  if (!text.trim().includes(' ')) {
+    if (URL.canParse(text.trim())) return true
+  }
+}
+
+function isNumbersOnly(text) {
+  return /^\d+$/.test(text.replace(/\s/g, ''))
+}
+
+function isAllUppercase(text) {
+  return /[A-Z]/.test(text) && text === text.toUpperCase()
+}
+
+function isTooShort(text) {
+  const split = text.trim().split(/\s+/)
+  if (split.length <= 1) {
+    // return !isNumbersOnly(text) && !isURL(text) && !isEmailOnly(text) && !isAllUppercase(text)
+    return true
+  }
+}
+
+function isNotLanguage(text, language_) {
+  const bestGuess = language.guessBest(text.trim())
+  if (!bestGuess) return true // Can happen if the text is just whitespace
+  // @horizon-rs/language-guesser is based on tri-grams and can lead
+  // to false positives. For example, it thinks that 'Thamk you ❤️🙏' is
+  // Haitian! And that 'I wanne robux 1000' is Polish!
+  // But that's because they are short and there's not enough clues to
+  // guess what language it is. You and I might know those are actually
+  // attempts to be English, despite the spelling.
+  // But are they useful comments? Given that this is just a signal,
+  // and not a hard blocker, it's more of a clue than a fact.
+  return bestGuess.alpha2 !== language_
+}
+
+function getCussWords(lang) {
+  switch (lang) {
+    case 'pt':
+      return cussPt
+    case 'fr':
+      return cussFr
+    case 'es':
+      return cussEs
+    default:
+      return cuss
+  }
+}
+
+function isLikelyCussWords(text, language_, rating = 2) {
+  const cussWords = getCussWords(language_)
+  for (const word of splitWords(text, language_ || 'en')) {
+    if (cussWords[word] && cussWords[word] === rating) {
+      return true
+    }
+  }
+  return false
+}
+
+function isMaybeCussWords(text, language_) {
+  return isLikelyCussWords(text, language_, 1)
+}
+
+const segmenter = new Intl.Segmenter([], { granularity: 'word' })
+
+function splitWords(text) {
+  const segmentedText = segmenter.segment(text)
+  return [...segmentedText].filter((s) => s.isWordLike).map((s) => s.segment)
+}
diff --git a/src/events/middleware.js b/src/events/middleware.js
@@ -8,6 +8,7 @@ import { noCacheControl } from '#src/frame/middleware/cache-control.js'
 import { getJsonValidator } from '#src/tests/lib/validate-json-schema.js'
 import { formatErrors } from './lib/middleware-errors.js'
 import { publish as _publish } from './lib/hydro.js'
+import { analyzeComment } from './analyze-comment.js'
 
 const router = express.Router()
 const OMIT_FIELDS = ['type']
@@ -90,18 +91,9 @@ router.post(
       return res.status(400).json({ message: 'Empty comment' })
     }
 
-    const signals = []
-    const rating = 1.0
+    const { rating } = await analyzeComment(comment, locale)
 
-    // if (comment.includes('@') && !comment.includes(' ')) {
-    //   // XXX Make it a simple email validator
-    //   signals.push({
-    //     email: 'Looks like an email address',
-    //   })
-    //   rating -= 0.1
-    // }
-
-    return res.json({ rating, signals })
+    return res.json({ rating })
   }),
 )
 

diff --git a/src/events/scripts/analyze-comments-csv.ts b/src/events/scripts/analyze-comments-csv.ts
@@ -0,0 +1,100 @@
+/**
+ * This script is used to analyze posted survey comments in a CSV file.
+ * The CSV file is expected to have come from the Azure Data Explorer
+ * after having queries the `docs_v0_survey_event` table.
+ *
+ *
+ */
+
+import fs from 'node:fs'
+import util from 'node:util'
+
+import chalk from 'chalk'
+import { parse } from 'csv-parse'
+import { program } from 'commander'
+
+import { SIGNAL_RATINGS } from '../analyze-comment'
+
+type Options = {
+  outputFile: string
+  limit: string
+  random: boolean
+}
+program
+  .description('Analyze survey comments in a CSV file')
+  .option('-o, --output-file <path>', 'path to the output', 'stdout')
+  .option('--limit <number>', 'limit number of records analyzed', 'Infinity')
+  .option(
+    '--random',
+    'randomize the lines analyzed (useful when limit is less than size of CSV)',
+    false,
+  )
+  .argument('<csv-files...>', 'path to the exported CSV file')
+  .action(main)
+
+program.parse(process.argv)
+
+async function main(csvFile: string[], options: Options) {
+  for (const file of csvFile) {
+    await analyzeFile(file, options)
+  }
+}
+
+type Record = {
+  [key: string]: string | number
+}
+
+async function analyzeFile(csvFile: string, options: Options) {
+  const parser = fs.createReadStream(csvFile).pipe(
+    parse({
+      // Needed when parsing CSVs from the Azure Data Explorer
+      bom: true,
+    }),
+  )
+  let headers: null | string[] = null
+  const records: Record[] = []
+  for await (const record of parser) {
+    if (headers === null) {
+      headers = record as string[]
+    } else {
+      const obj: {
+        [key: string]: string
+      } = {}
+      for (let i = 0; i < headers.length; i++) {
+        obj[headers[i]] = record[i]
+      }
+      records.push(obj)
+    }
+  }
+
+  const limit = parseInt(options.limit)
+  if (options.random) {
+    records.sort(() => Math.random() - 0.5)
+  }
+  for (const record of records.slice(0, limit)) {
+    const language = record.survey_comment_language || 'en'
+    let rating = 1.0
+    let first = true
+    for (const { reduction, name, validator } of SIGNAL_RATINGS) {
+      const hit = validator(record.survey_comment, language)
+      if (hit) {
+        rating -= reduction
+        if (first) {
+          console.log(util.inspect(record.survey_comment))
+          first = false
+        }
+        console.log(name.padEnd(10), reduction)
+        if (rating <= 0.0) {
+          break
+        }
+      }
+    }
+    if (rating !== 1.0) {
+      console.log(chalk.yellow(`Rating: ${rating}`))
+    } else {
+      console.log(chalk.green('No rating reduction'))
+    }
+
+    console.log('\n')
+  }
+}