import-wikimedia.mjs script

This commit is contained in:
John Livingston 2023-09-25 17:12:10 +02:00
parent 6705cd9e3d
commit d7d7b9e1df
No known key found for this signature in database
GPG Key ID: B17B5640CE66CDBC
5 changed files with 99 additions and 0 deletions

1
package-lock.json generated
View File

@ -30,6 +30,7 @@
"@types/winston": "^2.4.4", "@types/winston": "^2.4.4",
"@typescript-eslint/eslint-plugin": "^4.29.0", "@typescript-eslint/eslint-plugin": "^4.29.0",
"@typescript-eslint/parser": "^4.29.0", "@typescript-eslint/parser": "^4.29.0",
"commander": "^11.0.0",
"esbuild": "^0.16.1", "esbuild": "^0.16.1",
"eslint": "^7.32.0", "eslint": "^7.32.0",
"eslint-config-standard": "^16.0.3", "eslint-config-standard": "^16.0.3",

View File

@ -54,6 +54,7 @@
"@types/winston": "^2.4.4", "@types/winston": "^2.4.4",
"@typescript-eslint/eslint-plugin": "^4.29.0", "@typescript-eslint/eslint-plugin": "^4.29.0",
"@typescript-eslint/parser": "^4.29.0", "@typescript-eslint/parser": "^4.29.0",
"commander": "^11.0.0",
"esbuild": "^0.16.1", "esbuild": "^0.16.1",
"eslint": "^7.32.0", "eslint": "^7.32.0",
"eslint-config-standard": "^16.0.3", "eslint-config-standard": "^16.0.3",

View File

@ -5,6 +5,12 @@ This page lists some common options you can use to configure the bot forbidden w
Note: this page is still "Work In Progress". Feel free to contribute, by suggesting new entries on the Note: this page is still "Work In Progress". Feel free to contribute, by suggesting new entries on the
[livechat plugin github page](https://github.com/JohnXLivingston/peertube-plugin-livechat). [livechat plugin github page](https://github.com/JohnXLivingston/peertube-plugin-livechat).
## Importing rules from wiktionary
There is a [wiktionary import script](./import-wikimedia.md) that can list all page titles in a Wiktionary category.
This script was used to generate some of the forbidden-words lists that are present in this repository.
In such cases, the command that was used is provided next to the word list.
## URLs ## URLs
If you want to prevent users from sending URLs in the chat, you can use the following values for the bot configuration. If you want to prevent users from sending URLs in the chat, you can use the following values for the bot configuration.

View File

@ -0,0 +1,27 @@
# import-wikimedia.mjs
The [import-wikimedia.mjs](./import-wikimedia.mjs) script can be used to generate some word lists.
It can parse some categories from the [wiktionary](https://www.wiktionary.org), and return the result as a flat text list.
This list can be pasted in the "forbidden words" field (do not forget to uncheck "consider as a regular expression").
## Pre-requisite
To use it, you have to install NodeJS (version >= 18, because the script relies on the built-in `fetch` function) on your computer.
You also need the `commander` package. To get it, you have two choices:
* you can install the plugin dev dependencies (`npm install` if you are not on the production server)
* or `npm install -g commander`
## Usage
To run the script:
```bash
# commands to run from the livechat plugin directory.
# to get the script help:
node ./support/forbidden_words/import-wikimedia.mjs --help
# to get a category content, and save it to /tmp/result.txt:
node ./support/forbidden_words/import-wikimedia.mjs category --lang fr --service wiktionary --category 'Insultes_en_français' > /tmp/result.txt
```

View File

@ -0,0 +1,64 @@
import { Command } from 'commander'
// Root command-line program. On a usage error, the help text is printed after the error message.
const program = new Command()
program.usage('[command] [options]').showHelpAfterError()

// "category" sub-command: declares its description and its three options.
// --category is mandatory; --service and --lang fall back to the default
// values given as last argument.
const runCommand = program
  .command('category')
  .description('Loads data from wikimedia services (wiktionary, ...), and prints it on the standard output.')
  .requiredOption('-c, --category <category>', 'The category to request.')
  .option(
    '-s, --service <service>',
    'The service to query. Possible values: "wiktionary", "wikipedia". Default: "wiktionary".',
    'wiktionary'
  )
  .option(
    '-l, --lang <lang>',
    'The code lang for the requested service. For example "en", "fr", ... (as it appears in the wikimedia site url). Default: "en"',
    'en'
  )
/**
 * Handler of the "category" command.
 *
 * Queries the `categorymembers` list of the MediaWiki API for the requested
 * category, following the continuation parameters until every page has been
 * fetched, and prints one page title per line on the standard output.
 *
 * @param {object} options - parsed command line options.
 * @param {string} options.category - category name, without the "Category:" prefix.
 * @param {string} [options.service] - "wiktionary" or "wikipedia".
 * @param {string} options.lang - language code used as sub-domain ("en", "fr", ...).
 * @throws {Error} on invalid option, failed HTTP request, API error or unexpected response.
 */
runCommand.action(async (options) => {
  // The global fetch is only enabled by default from NodeJS 18: fail with a clear message.
  if (typeof fetch !== 'function') {
    throw new Error('This script requires the global fetch function (NodeJS >= 18)')
  }

  const service = options.service ?? 'wiktionary'
  if (service !== 'wiktionary' && service !== 'wikipedia') {
    throw new Error('Invalid service ' + service)
  }

  // lang is injected in the host name, so it is restricted to a strict character set.
  const lang = options.lang
  if (!/^[a-z-]+$/.test(lang)) {
    throw new Error('Invalid lang ' + lang)
  }

  const category = options.category

  const url = new URL(`https://${lang}.${service}.org/w/api.php`)
  url.searchParams.set('action', 'query')
  url.searchParams.set('format', 'json')
  url.searchParams.set('formatversion', '2')
  url.searchParams.set('list', 'categorymembers')
  url.searchParams.set('cmlimit', '200')
  url.searchParams.set('cmtitle', 'Category:' + category)

  let hasMore = true
  while (hasMore) {
    const r = await fetch(url.toString())
    if (!r.ok) {
      throw new Error(`Failed requesting ${url} (HTTP ${r.status} ${r.statusText})`)
    }

    const json = await r.json()
    if (!json) {
      throw new Error('Invalid JSON content')
    }
    // The API reports its own failures with an HTTP 200 response and an "error" object.
    if (json.error) {
      throw new Error(`API error ${json.error.code ?? ''}: ${json.error.info ?? JSON.stringify(json.error)}`)
    }

    const list = json.query?.categorymembers
    if (!Array.isArray(list)) {
      throw new Error('Invalid JSON content: missing query.categorymembers')
    }

    for (const entry of list) {
      // Note: at the end, there might be some links to other categories.
      // These links have title like "Category:xxx", or "Catégorie:xxx".
      // We don't want to test every translation of "Category"...
      // So we just ignore titles with ":"
      if (entry.title.includes(':')) { continue }
      console.log(entry.title)
    }

    // When the result is truncated, the API returns a "continue" object whose
    // entries must be sent back as parameters to get the next batch.
    hasMore = Boolean(json.continue)
    if (hasMore) {
      for (const [k, v] of Object.entries(json.continue)) {
        url.searchParams.set(k, v)
      }
    }
  }
})
// The "category" action handler is async: parseAsync returns a promise that
// settles once the handler is done, so that a failure is reported from here
// instead of from a floating promise.
await program.parseAsync(process.argv)