From d7d7b9e1df536cbe3847a54e8a1b663e15419bf1 Mon Sep 17 00:00:00 2001 From: John Livingston Date: Mon, 25 Sep 2023 17:12:10 +0200 Subject: [PATCH] import-wikimedia.mjs script --- package-lock.json | 1 + package.json | 1 + support/forbidden_words/README.md | 6 ++ support/forbidden_words/import-wikimedia.md | 27 +++++++++ support/forbidden_words/import-wikimedia.mjs | 64 ++++++++++++++++++++ 5 files changed, 99 insertions(+) create mode 100644 support/forbidden_words/import-wikimedia.md create mode 100644 support/forbidden_words/import-wikimedia.mjs diff --git a/package-lock.json b/package-lock.json index a4d5e2cd..9bb7c859 100644 --- a/package-lock.json +++ b/package-lock.json @@ -30,6 +30,7 @@ "@types/winston": "^2.4.4", "@typescript-eslint/eslint-plugin": "^4.29.0", "@typescript-eslint/parser": "^4.29.0", + "commander": "^11.0.0", "esbuild": "^0.16.1", "eslint": "^7.32.0", "eslint-config-standard": "^16.0.3", diff --git a/package.json b/package.json index 39f83b6f..e53b95ac 100644 --- a/package.json +++ b/package.json @@ -54,6 +54,7 @@ "@types/winston": "^2.4.4", "@typescript-eslint/eslint-plugin": "^4.29.0", "@typescript-eslint/parser": "^4.29.0", + "commander": "^11.0.0", "esbuild": "^0.16.1", "eslint": "^7.32.0", "eslint-config-standard": "^16.0.3", diff --git a/support/forbidden_words/README.md b/support/forbidden_words/README.md index 3950642d..448c7c40 100644 --- a/support/forbidden_words/README.md +++ b/support/forbidden_words/README.md @@ -5,6 +5,12 @@ This page lists some common options you can use to configure the bot forbidden w Note: this page is still "Work In Progress". Fill free to contribute, by suggesting new entries on the [livechat plugin github page](https://github.com/JohnXLivingston/peertube-plugin-livechat). +## Importing rules from wiktionary + +There is a [wiktionary import script](./import-wikimedia.md) that can list all page titles in a Wiktionary category. 
+This script was used to generate some of the forbidden-words lists that are present in this repository. +In such cases, the command is provided next to the word list. + ## URLs If you want to prevent users to send URLs in the chat, you can use following values for the bot configuration. diff --git a/support/forbidden_words/import-wikimedia.md b/support/forbidden_words/import-wikimedia.md new file mode 100644 index 00000000..682e2cb2 --- /dev/null +++ b/support/forbidden_words/import-wikimedia.md @@ -0,0 +1,27 @@ +# import-wikimedia.mjs + +The [import-wikimedia.mjs](./import-wikimedia.mjs) script can be used to generate some word lists. + +It can parse some categories from the [wiktionary](https://www.wiktionary.org), and return the result as a flat text list. +This list can be pasted in the "forbidden words" field (do not forget to uncheck "consider as a regular expression"). + +## Pre-requisite + +To use it, you have to install NodeJS (version >= 16) on your computer. + +You also need the `commander` package. To get it, you have two choices: + +* you can install the plugin dev dependencies (`npm install` if you are not on the production server) +* or `npm install -g commander` + +## Usage + +To run the script: + +```bash +# commands to run from the livechat plugin directory. 
+# to get the script help:
+node ./support/forbidden_words/import-wikimedia.mjs --help
+# to get a category content, and save it to /tmp/result.txt:
+node ./support/forbidden_words/import-wikimedia.mjs category --lang fr --service wiktionary --category 'Insultes_en_français' > /tmp/result.txt
+```
diff --git a/support/forbidden_words/import-wikimedia.mjs b/support/forbidden_words/import-wikimedia.mjs
new file mode 100644
index 00000000..6596027a
--- /dev/null
+++ b/support/forbidden_words/import-wikimedia.mjs
@@ -0,0 +1,81 @@
+/**
+ * Lists the page titles of a Wikimedia category (wiktionary or wikipedia),
+ * printing one title per line on the standard output.
+ *
+ * Usage is documented in ./import-wikimedia.md.
+ * Requires the `commander` package, and a NodeJS runtime that provides a global `fetch`.
+ */
+import { Command } from 'commander'
+
+const program = new Command()
+program
+  .usage('[command] [options]')
+  .showHelpAfterError()
+
+const runCommand = program.command('category')
+runCommand.description('Loads data from wikimedia services (wiktionary, ...), and prints it on the standard output.')
+// The "<...>" placeholder tells commander that the option takes a value.
+// Without it, the option is a boolean flag and the value given on the command line is not stored.
+runCommand.requiredOption('-c, --category <category>', 'The category to request.')
+runCommand.option(
+  '-s, --service <service>',
+  'The service to query. Possible values: "wiktionary", "wikipedia". Default: "wiktionary".',
+  'wiktionary'
+)
+runCommand.option(
+  '-l, --lang <lang>',
+  'The code lang for the requested service. For example "en", "fr", ... (as it appears in the wikimedia site url). Default: "en"',
+  'en'
+)
+runCommand.action(async (options) => {
+  const service = options.service ?? 'wiktionary'
+  // service and lang are interpolated into the request hostname, so both are validated first.
+  if (service !== 'wiktionary' && service !== 'wikipedia') {
+    throw new Error('Invalid service ' + service)
+  }
+  const lang = options.lang
+  if (!/^[a-z-]+$/.test(lang)) {
+    throw new Error('Invalid lang ' + lang)
+  }
+  const category = options.category
+
+  const url = new URL(`https://${lang}.${service}.org/w/api.php?action=query&format=json&formatversion=2&list=categorymembers&cmlimit=200`)
+  url.searchParams.append('cmtitle', 'Category:' + category)
+
+  // Results are paginated: request pages until the response has no "continue" data.
+  while (true) {
+    const r = await fetch(url.toString())
+    if (!r.ok) {
+      throw new Error('Failed requesting ' + url)
+    }
+    const json = await r.json()
+    if (!json) {
+      throw new Error('Invalid JSON content')
+    }
+    // The API can report a failure in the response body (an "error" object), with no "query" data.
+    if (json.error) {
+      throw new Error('API error: ' + (json.error.info ?? json.error.code ?? 'unknown'))
+    }
+    const list = json.query?.categorymembers
+    if (!Array.isArray(list)) {
+      throw new Error('Invalid JSON content')
+    }
+    for (const entry of list) {
+      // Note: at the end, there might be some links to other categories.
+      // These links have title like "Category:xxx", or "Catégorie:xxx".
+      // We don't want to test every translation of "Category"...
+      // So we just ignore titles with ":"
+      if (entry.title.includes(':')) { continue }
+      console.log(entry.title)
+    }
+
+    if (!json.continue) { break }
+    // Copy the continuation parameters into the url, to request the next page.
+    for (const [k, v] of Object.entries(json.continue)) {
+      url.searchParams.set(k, v)
+    }
+  }
+})
+
+// The action handler is async: commander requires parseAsync (instead of parse) in such case.
+await program.parseAsync(process.argv)