Skip to content

Repo sync #33337

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Jun 4, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

3 changes: 3 additions & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -197,6 +197,7 @@
"dependencies": {
"@elastic/elasticsearch": "8.13.1",
"@github/failbot": "0.8.3",
"@horizon-rs/language-guesser": "0.1.1",
"@octokit/plugin-retry": "6.0.1",
"@octokit/request-error": "6.1.1",
"@primer/behaviors": "^1.5.1",
Expand All @@ -216,6 +217,7 @@
"connect-datadog": "0.0.9",
"connect-timeout": "1.9.0",
"cookie-parser": "^1.4.6",
"cuss": "2.2.0",
"dayjs": "^1.11.3",
"dotenv": "^16.4.5",
"escape-string-regexp": "5.0.0",
Expand Down Expand Up @@ -313,6 +315,7 @@
"commander": "^12.1.0",
"cross-env": "^7.0.3",
"csp-parse": "0.0.2",
"csv-parse": "5.5.6",
"eslint": "8.57.0",
"eslint-config-prettier": "9.1.0",
"eslint-config-standard": "17.1.0",
Expand Down
158 changes: 158 additions & 0 deletions src/events/analyze-comment.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,158 @@
import { cuss } from 'cuss'
import { cuss as cussPt } from 'cuss/pt'
import { cuss as cussFr } from 'cuss/fr'
import { cuss as cussEs } from 'cuss/es'
import { Language } from '@horizon-rs/language-guesser'

const language = new Language()

// Table of heuristics used to rate a survey comment. Each entry carries:
//   name      - the label reported when the heuristic fires
//   reduction - how much is subtracted from the rating when it fires
//   validator - predicate called with (comment, language)
// The order matters because consumers walk the list top to bottom and
// may stop early once the rating has dropped to zero.
// Exported for the debugging CLI script
export const SIGNAL_RATINGS = [
  ['email-only', 1.0, (comment) => isEmailOnly(comment)],
  ['contains-email', 0.2, (comment) => isContainingEmail(comment)],
  ['url-only', 0.1, (comment) => isURL(comment)],
  ['numbers-only', 0.1, (comment) => isNumbersOnly(comment)],
  ['all-uppercase', 0.1, (comment) => isAllUppercase(comment)],
  ['too-short', 0.1, (comment) => isTooShort(comment)],
  ['not-language', 0.2, (comment, lang) => isNotLanguage(comment, lang)],
  ['cuss-words-likely', 0.3, (comment, lang) => isLikelyCussWords(comment, lang)],
  ['cuss-words-maybe', 0.1, (comment, lang) => isMaybeCussWords(comment, lang)],
].map(([name, reduction, validator]) => ({ reduction, name, validator }))

/**
 * Rate a comment by running it through every entry of SIGNAL_RATINGS.
 *
 * The rating starts at 1.0 and each heuristic that fires subtracts its
 * `reduction`. Evaluation stops as soon as the rating reaches zero or
 * below, so later heuristics are not evaluated in that case.
 *
 * @param text The comment to analyze.
 * @param language Language code handed to every validator (default 'en').
 * @returns `{ signals, rating }` where `signals` lists the names of the
 *   heuristics that fired, in table order.
 */
export async function analyzeComment(text, language = 'en') {
  let remaining = 1.0
  const triggered = []

  for (const entry of SIGNAL_RATINGS) {
    const { validator, name, reduction } = entry
    const matched = validator(text, language)
    if (matched) {
      triggered.push(name)
      remaining -= reduction
    }
    if (remaining <= 0) {
      break
    }
  }

  return { signals: triggered, rating: remaining }
}

/**
 * Tell whether the text is nothing but a single email-like token.
 *
 * This is a deliberately loose check, not a full address validator: the
 * trimmed text must contain no whitespace, must not contain '://' (which
 * would make it look like a URL with credentials), and must have exactly
 * one '@' with something on both sides of it.
 *
 * Requiring both sides to be non-empty keeps a bare '@', a handle such as
 * '@octocat', or a dangling 'foo@' from being classified as an email.
 *
 * @param text The comment (or a single word of it).
 * @returns true when the text looks like one email address, otherwise false.
 */
function isEmailOnly(text) {
  const candidate = text.trim()
  if (/\s/.test(candidate) || candidate.includes('://')) {
    return false
  }

  const parts = candidate.split('@')
  if (parts.length !== 2) {
    // Zero '@' signs, or more than one.
    return false
  }

  const [localPart, domain] = parts
  return localPart.length > 0 && domain.length > 0
}

/**
 * Tell whether the text has an email-like word somewhere inside it while
 * not being an email address on its own (that case is covered by
 * isEmailOnly).
 *
 * @param text The comment to inspect.
 * @returns true when at least one whitespace-separated word passes
 *   isEmailOnly, false otherwise.
 */
function isContainingEmail(text) {
  const mightHoldEmail = text.includes('@')
  if (!mightHoldEmail || isEmailOnly(text)) {
    return false
  }
  // Split on whitespace rather than with splitWords(): the word segmenter
  // would break `foo@example.com` apart into ['foo', 'example.com'].
  const tokens = text.split(/\s+/g)
  return tokens.some((token) => isEmailOnly(token))
}

/**
 * Tell whether the text is a single parseable URL and nothing else.
 *
 * Any whitespace inside the trimmed text (not just a plain space) means
 * the comment has more than one token, so it is rejected up front. This
 * matters because the URL parser silently drops tabs and newlines and
 * would otherwise accept such input.
 *
 * @param text The comment to inspect.
 * @returns true when the trimmed text parses as an absolute URL, otherwise false.
 */
function isURL(text) {
  const candidate = text.trim()
  if (/\s/.test(candidate)) {
    return false
  }
  return URL.canParse(candidate)
}

/**
 * Tell whether the text consists solely of digits once all whitespace
 * has been removed. Empty or whitespace-only text does not count.
 *
 * @param text The comment to inspect.
 * @returns true for digit-only text, false otherwise.
 */
function isNumbersOnly(text) {
  const withoutWhitespace = text.replace(/\s/g, '')
  const digitsOnly = /^\d+$/
  return digitsOnly.test(withoutWhitespace)
}

/**
 * Tell whether the text is "shouting": it contains at least one ASCII
 * capital letter and upper-casing it changes nothing.
 *
 * @param text The comment to inspect.
 * @returns true when the text is entirely upper case, false otherwise.
 */
function isAllUppercase(text) {
  // Without a capital letter (e.g. digits or punctuation only) the
  // comparison below would be trivially true, so rule that out first.
  if (!/[A-Z]/.test(text)) {
    return false
  }
  return text.toUpperCase() === text
}

/**
 * Tell whether the text is at most one whitespace-separated word.
 *
 * Empty and whitespace-only text also counts as too short, because
 * splitting the trimmed empty string still yields one (empty) element.
 *
 * A stricter variant that exempted numbers-only, URL-only, email-only and
 * all-uppercase comments was tried and left disabled; those cases simply
 * stack with their own signals.
 *
 * @param text The comment to inspect.
 * @returns true for zero or one word, otherwise false.
 */
function isTooShort(text) {
  const words = text.trim().split(/\s+/)
  return words.length <= 1
}

/**
 * Tell whether the guessed language of the text differs from the
 * expected one.
 *
 * The guesser (@horizon-rs/language-guesser) works on tri-grams, so short
 * or misspelled text gives it few clues and can produce false positives.
 * For example, 'Thamk you ❤️🙏' is guessed as Haitian and
 * 'I wanne robux 1000' as Polish, although a human would read both as
 * attempts at English. Since this is only one signal among several, and
 * never a hard blocker, treat the result as a clue rather than a fact.
 *
 * @param text The comment to inspect.
 * @param language_ The expected two-letter language code.
 * @returns true when no guess could be made or the guess is a different language.
 */
function isNotLanguage(text, language_) {
  const guess = language.guessBest(text.trim())
  // No guess at all can happen if the text is just whitespace.
  return !guess || guess.alpha2 !== language_
}

/**
 * Pick the profanity dictionary for a language code.
 *
 * @param lang Language code; 'pt', 'fr' and 'es' have their own
 *   dictionaries.
 * @returns The matching dictionary, or the default `cuss` dictionary for
 *   any other value.
 */
function getCussWords(lang) {
  if (lang === 'pt') return cussPt
  if (lang === 'fr') return cussFr
  if (lang === 'es') return cussEs
  return cuss
}

/**
 * Tell whether any word of the text is listed in the profanity dictionary
 * with exactly the given rating.
 *
 * Words are lower-cased before the lookup because the dictionaries are
 * keyed by lower-case words; without that, capitalised or upper-case
 * profanity (for example at the start of a sentence) would never match.
 *
 * @param text The comment to inspect.
 * @param language_ Language code used to pick the dictionary.
 * @param rating The dictionary rating a word must have to count (default 2).
 * @returns true when at least one word has that rating, otherwise false.
 */
function isLikelyCussWords(text, language_, rating = 2) {
  const cussWords = getCussWords(language_)
  for (const word of splitWords(text)) {
    const score = cussWords[word.toLowerCase()]
    if (score && score === rating) {
      return true
    }
  }
  return false
}

/**
 * Same check as isLikelyCussWords, but for words carrying the lower
 * dictionary rating of 1.
 *
 * @param text The comment to inspect.
 * @param language_ Language code used to pick the dictionary.
 * @returns true when at least one word is rated 1, otherwise false.
 */
function isMaybeCussWords(text, language_) {
  const maybeRating = 1
  return isLikelyCussWords(text, language_, maybeRating)
}

// One shared word segmenter; the empty locale list selects the runtime's
// default locale.
const segmenter = new Intl.Segmenter([], { granularity: 'word' })

/**
 * Break text into its words, dropping whitespace and punctuation
 * segments.
 *
 * @param text The text to split.
 * @returns The word-like segments in their original order.
 */
function splitWords(text) {
  const words = []
  for (const piece of segmenter.segment(text)) {
    if (piece.isWordLike) {
      words.push(piece.segment)
    }
  }
  return words
}
14 changes: 3 additions & 11 deletions src/events/middleware.js
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ import { noCacheControl } from '#src/frame/middleware/cache-control.js'
import { getJsonValidator } from '#src/tests/lib/validate-json-schema.js'
import { formatErrors } from './lib/middleware-errors.js'
import { publish as _publish } from './lib/hydro.js'
import { analyzeComment } from './analyze-comment.js'

const router = express.Router()
const OMIT_FIELDS = ['type']
Expand Down Expand Up @@ -90,18 +91,9 @@ router.post(
return res.status(400).json({ message: 'Empty comment' })
}

const signals = []
const rating = 1.0
const { rating } = await analyzeComment(comment, locale)

// if (comment.includes('@') && !comment.includes(' ')) {
// // XXX Make it a simple email validator
// signals.push({
// email: 'Looks like an email address',
// })
// rating -= 0.1
// }

return res.json({ rating, signals })
return res.json({ rating })
}),
)

Expand Down
100 changes: 100 additions & 0 deletions src/events/scripts/analyze-comments-csv.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
/**
* This script is used to analyze posted survey comments in a CSV file.
* The CSV file is expected to have come from the Azure Data Explorer
* after having queried the `docs_v0_survey_event` table.
*
*
*/

import fs from 'node:fs'
import util from 'node:util'

import chalk from 'chalk'
import { parse } from 'csv-parse'
import { program } from 'commander'

import { SIGNAL_RATINGS } from '../analyze-comment'

/** Command-line options in the shape commander hands them to the action. */
type Options = {
  /** Value of `-o, --output-file` (default 'stdout'). Not read by this script yet. */
  outputFile: string
  /** Raw value of `--limit`; a string because no option parser is registered. */
  limit: string
  /** Whether `--random` was passed. */
  random: boolean
}

program
  .description('Analyze survey comments in a CSV file')
  .option('-o, --output-file <path>', 'path to the output', 'stdout')
  .option('--limit <number>', 'limit number of records analyzed', 'Infinity')
  .option(
    '--random',
    'randomize the lines analyzed (useful when limit is less than size of CSV)',
    false,
  )
  .argument('<csv-files...>', 'path to the exported CSV file')
  .action(main)

// The action handler is async, so parseAsync() is required for commander
// to wait for it; a failure is reported and turned into a non-zero exit.
program.parseAsync(process.argv).catch((error: unknown) => {
  console.error(error)
  process.exitCode = 1
})

/**
 * Commander action: analyze every CSV file given on the command line,
 * one after the other and in the order given.
 *
 * @param csvFile Paths of the CSV files to analyze.
 * @param options Parsed command-line options, passed through unchanged.
 */
async function main(csvFile: string[], options: Options) {
  for (let index = 0; index < csvFile.length; index += 1) {
    const path = csvFile[index]
    await analyzeFile(path, options)
  }
}

/**
 * One CSV data row keyed by the column names of the header row.
 * (Deliberately not called `Record`, which would shadow the built-in
 * utility type.)
 */
type SurveyRecord = {
  [key: string]: string
}

/**
 * Turn the raw `--limit` option into a number usable as a slice end.
 *
 * The option defaults to the string 'Infinity', which parseInt() turns
 * into NaN; slicing up to NaN yields an empty array, so that default has
 * to be recognised explicitly to mean "no limit".
 *
 * @throws Error when the value is neither 'Infinity' nor a non-negative integer.
 */
function parseLimit(raw: string): number {
  if (raw === 'Infinity') return Infinity
  const parsed = Number.parseInt(raw, 10)
  if (Number.isNaN(parsed) || parsed < 0) {
    throw new Error(`Invalid --limit value: "${raw}"`)
  }
  return parsed
}

/** Shuffle an array in place with an unbiased Fisher-Yates pass. */
function shuffleInPlace<T>(items: T[]): void {
  for (let i = items.length - 1; i > 0; i--) {
    const j = Math.floor(Math.random() * (i + 1))
    const swapped = items[i]
    items[i] = items[j]
    items[j] = swapped
  }
}

/**
 * Read one CSV file and print, for each analyzed row, which signals from
 * SIGNAL_RATINGS fire on its `survey_comment` column and the resulting
 * rating.
 *
 * The first CSV row is taken as the header row. The language passed to
 * the validators comes from the `survey_comment_language` column and
 * falls back to 'en' when that is empty or missing.
 *
 * @param csvFile Path of the CSV file to read.
 * @param options Parsed command-line options; `limit` and `random` are used.
 * @throws Error when `options.limit` is not a valid limit.
 */
async function analyzeFile(csvFile: string, options: Options) {
  const parser = fs.createReadStream(csvFile).pipe(
    parse({
      // Needed when parsing CSVs from the Azure Data Explorer
      bom: true,
    }),
  )

  let headers: null | string[] = null
  const records: SurveyRecord[] = []
  for await (const record of parser) {
    if (headers === null) {
      headers = record as string[]
      continue
    }
    const obj: SurveyRecord = {}
    for (let i = 0; i < headers.length; i++) {
      obj[headers[i]] = record[i]
    }
    records.push(obj)
  }

  const limit = parseLimit(options.limit)
  if (options.random) {
    shuffleInPlace(records)
  }

  // Pad every signal name to the longest one so the reductions line up.
  const nameWidth = Math.max(...SIGNAL_RATINGS.map(({ name }) => name.length))

  for (const record of records.slice(0, limit)) {
    const language = record.survey_comment_language || 'en'
    let rating = 1.0
    let first = true
    for (const { reduction, name, validator } of SIGNAL_RATINGS) {
      const hit = validator(record.survey_comment, language)
      if (!hit) continue

      rating -= reduction
      if (first) {
        // Print the comment once, just before its first signal.
        console.log(util.inspect(record.survey_comment))
        first = false
      }
      console.log(name.padEnd(nameWidth), reduction)
      if (rating <= 0.0) {
        break
      }
    }
    if (rating !== 1.0) {
      console.log(chalk.yellow(`Rating: ${rating}`))
    } else {
      console.log(chalk.green('No rating reduction'))
    }

    console.log('\n')
  }
}
Loading
Loading