From 2e25f4e01854f2c3e2117715d31cb0c49f21a3ec Mon Sep 17 00:00:00 2001
From: phanirithvij
Date: Mon, 16 Sep 2024 08:57:55 +0530
Subject: [PATCH 01/15] gitignore .direnv

Signed-off-by: phanirithvij
---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index f2ea7e0..75c407a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
 /.envrc.private
 .terraform.lock.hcl
 **/.terraform
+.direnv

From 0afcb722676807b47e6e5081d5e9cf23da57436a Mon Sep 17 00:00:00 2001
From: phanirithvij
Date: Mon, 16 Sep 2024 13:29:07 +0530
Subject: [PATCH 02/15] ci: add broken link checker action

Signed-off-by: phanirithvij
---
 .github/workflows/check-dead-links.yml  | 30 +++++++
 checks/linkcheck/.envrc                 |  1 +
 checks/linkcheck/.gitignore             |  5 ++
 checks/linkcheck/README.md              | 38 ++++++++
 checks/linkcheck/allowed.links          | 27 ++++++
 checks/linkcheck/lychee.sh              | 99 +++++++++++++++++++++
 checks/linkcheck/main.py                | 113 ++++++++++++++++++++++++
 checks/linkcheck/pkgs/default.nix       |  4 +
 checks/linkcheck/pkgs/wikiextractor.nix | 44 +++++++++
 formatter.nix                           | 58 +++++-----
 10 files changed, 395 insertions(+), 24 deletions(-)
 create mode 100644 .github/workflows/check-dead-links.yml
 create mode 100644 checks/linkcheck/.envrc
 create mode 100644 checks/linkcheck/.gitignore
 create mode 100644 checks/linkcheck/README.md
 create mode 100644 checks/linkcheck/allowed.links
 create mode 100755 checks/linkcheck/lychee.sh
 create mode 100644 checks/linkcheck/main.py
 create mode 100644 checks/linkcheck/pkgs/default.nix
 create mode 100644 checks/linkcheck/pkgs/wikiextractor.nix

diff --git a/.github/workflows/check-dead-links.yml b/.github/workflows/check-dead-links.yml
new file mode 100644
index 0000000..7acb455
--- /dev/null
+++ b/.github/workflows/check-dead-links.yml
@@ -0,0 +1,30 @@
+name: "check broken links in wiki"
+on:
+  workflow_dispatch:
+  # twice per month
+  schedule:
+    - cron: "0 14 1,15 * *"
+jobs:
+  lychee:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: cachix/install-nix-action@v27
+        with:
+          nix_path: nixpkgs=channel:nixos-unstable
+      - run: ./checks/linkcheck/lychee.sh
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+      - name: upload lychee report as artifact
+        uses: actions/upload-artifact@v4
+        with:
+          name: lychee-report
+          if-no-files-found: error
+          path: ./checks/linkcheck/lychee*-report
+      - name: upload filtered xmldump artifact
+        uses: actions/upload-artifact@v4
+        with:
+          name: wikidump-filtered.xml
+          if-no-files-found: error
+          path: ./checks/linkcheck/workdir/wikidump-filtered.xml
+          compression-level: 9
diff --git a/checks/linkcheck/.envrc b/checks/linkcheck/.envrc
new file mode 100644
index 0000000..48cb98d
--- /dev/null
+++ b/checks/linkcheck/.envrc
@@ -0,0 +1 @@
+use flake .#linkcheck
diff --git a/checks/linkcheck/.gitignore b/checks/linkcheck/.gitignore
new file mode 100644
index 0000000..6f6c2b4
--- /dev/null
+++ b/checks/linkcheck/.gitignore
@@ -0,0 +1,5 @@
+temp
+.direnv
+*-report
+result*
+workdir
diff --git a/checks/linkcheck/README.md b/checks/linkcheck/README.md
new file mode 100644
index 0000000..4a1cb35
--- /dev/null
+++ b/checks/linkcheck/README.md
@@ -0,0 +1,38 @@
+## What
+
+wiki.nixos.org dead link checker, powered by GitHub Actions (gha) and
+[lychee](https://github.com/lycheeverse/lychee).
+
+Runs twice a month; the schedule can be adjusted in the github action cron job.
+Broken links need to be fixed manually in the wiki, nothing is automated.
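+
+Each run produces a `failed-wiki-links.csv` report: a tab-separated file with
+STATUS, URL and WIKIURL columns, mapping each dead link to the wiki page it
+appears on. An illustrative row (the URLs below are made up, not real output):
+
+```
+STATUS	URL	WIKIURL
+404	https://example.org/some-dead-link	https://wiki.nixos.org/wiki/Some_Page
+```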
+
+The initial run gave ~100 results, which were fixed manually; see the entries
+before 16 Sep
+[here](https://wiki.nixos.org/w/index.php?title=Special:Contributions/Phanirithvij&target=Phanirithvij&offset=&limit=100).
+
+## Why
+
+Dead links that are detected early can be fixed before link rot sets in.
+
+- Why not use a broken-link-checker github action?
+  - written so that it is not tied to gha (it works locally)
+  - gha only calls the script and uploads the artifacts
+
+## Instructions
+
+```shell
+cd ./checks/linkcheck
+direnv allow # or # nix develop ..#linkcheck
+./lychee.sh
+```
+
+It can be run from anywhere so `/path/to/checks/linkcheck/lychee.sh` works but
+the report will be generated at `/path/to/checks/linkcheck`.
+
+As usual, `nix fmt` works inside linkcheck dir.
+
+## TODO/Roadmap
+
+- [ ] archive all links found in lychee scan (see lychee --dump)
+  - Since these links are prone to deletion, it is our duty to archive them.
+  - There was a CLI tool for this (name forgotten); rediscover it.
diff --git a/checks/linkcheck/allowed.links b/checks/linkcheck/allowed.links
new file mode 100644
index 0000000..bb1c731
--- /dev/null
+++ b/checks/linkcheck/allowed.links
@@ -0,0 +1,27 @@
+# an allowlist of known bad links, as regexes
+# each line can be a comment, a regex, or a full/partial url
+# comments will be stripped out
+# urls must be urlencoded
+# stitched up into --exclude args per line and passed to lychee
+
+(http://(server|unix|hydra)|https://(cache|relay-server)/)
+
+# %7B is {, matches urls inside nix expressions like ${version} etc.
+(.*\.(domain|tld|mydomain|local).*|my.app|%7B)
+
+(qemu/1.0|locating-rules/1.0)
+
+# no need to fetch cache entries
+(cache.nixos.org|fzakaria.cachix.org)
+
+# urls in example snippets
+(USERNAME/nixpkgs|your_username/nixpkgs|fooUser/barRepo|code.visualstudio.com/sha|path/to/patch)
+
+# works in browser
+https://www.phoronix.com/news/Mesa-Delete-Clover-Discussion
+
+# works with git clone
+https://review.coreboot.org/coreboot.git
+
+# works in browser
+https://pypi.org/project/stt/#files
diff --git a/checks/linkcheck/lychee.sh b/checks/linkcheck/lychee.sh
new file mode 100755
index 0000000..eab8300
--- /dev/null
+++ b/checks/linkcheck/lychee.sh
@@ -0,0 +1,99 @@
+#! /usr/bin/env nix-shell
+#! 
nix-shell -i bash -p wget p7zip bash findutils gnused coreutils lychee +# shellcheck shell=bash + +SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd) + +workdir="$SCRIPT_DIR/workdir" +mkdir -p "$workdir" +pushd "$workdir" || exit + +wget -O wikidump.xml.zst "https://wiki.nixos.org/wikidump.xml.zst" +7z x -aoa wikidump.xml.zst + +# filter unimportant pages like User:* Talk:* +python ../main.py filter wikidump.xml wikidump-filtered.xml + +# generate exclude args from allowlist +python ../main.py badlinks ../allowed.links exclude-args + +# exlude sending requests to the wiki +echo "--exclude wiki.nixos.org/wiki" >>exclude-args +extrargs="$extrargs --exclude wiki.nixos.org/wiki" +excludeargs=$(cat exclude-args) + +# extract only the text from the filtered xml dump +nix --extra-experimental-features "nix-command flakes" run ..#wikiextractor wikidump-filtered.xml + +# lychee requires .md or .html format files to parse +find text -type f | grep -v .html | xargs -I{} mv {} "{}.html" + +# default is too high +extrargs="$extrargs --max-concurrency 16" + +# github_token from env or fallback to gh (local dev) +if [ -z "${GITHUB_TOKEN}" ]; then + if command -v gh -v &>/dev/null; then + echo using gh auth token + GITHUB_TOKEN=$(gh auth token) + fi +fi + +if [ -n "${GITHUB_TOKEN}" ]; then + echo using github token + extrargs="$extrargs --github-token $GITHUB_TOKEN" +fi + +# shellcheck disable=SC2086 +# fetch links +lychee -E \ + --cache --scheme http --scheme https \ + --include-verbatim $excludeargs $extrargs \ + text | + tee lychee.log + +# shellcheck disable=SC2086 +# get all links ignoring the allowlist (allowed.links) +lychee -E \ + --cache --scheme http --scheme https \ + --include-verbatim $extrargs \ + text | + tee lychee-full.log + +# shellcheck disable=SC2086 +# save fail_map so we can construct wiki link map to failed urls +lychee -E \ + --cache --scheme http --scheme https \ + --include-verbatim $excludeargs $extrargs \ + --format json \ + text >lychee.json + +# get archive suggestions +# --timeout not working with --suggest see https://github.com/lycheeverse/lychee/issues/1501 +# TODO remove timeout command later after the issue is fixed +# shellcheck disable=SC2086 +timeout 30 lychee -E \ + --cache --scheme http --scheme https \ + --include-verbatim $excludeargs $extrargs \ + --suggest \ + text | + tee lychee-wayback.log + +# csv of status, url, corresponding wiki page link +python ../main.py dumplinkmap lychee.json failed-wiki-links.csv + +# sort for consistency +{ + head -n 1 failed-wiki-links.csv + tail -n +2 failed-wiki-links.csv | sort -t$'\t' -k2,2 +} >sorted_filename.tsv +mv sorted_filename.tsv failed-wiki-links.csv + +cat failed-wiki-links.csv + +dest="../lychee-$(printf '%(%Y-%m-%d)T\n')-report" +mkdir "$dest" +cp ../allowed.links lychee*.log failed-wiki-links.csv "$dest" + +popd || exit +#rm -rf "$workdir" diff --git a/checks/linkcheck/main.py b/checks/linkcheck/main.py new file mode 100644 index 0000000..b8128b0 --- /dev/null +++ b/checks/linkcheck/main.py @@ -0,0 +1,113 @@ +import json +import csv +import re +import sys +import xml.etree.ElementTree as ET + + +# filter out unimportant pages like Talk:, User:, and old revisions of posts +def process_dump(dump_file, out_file): + tree = ET.parse(dump_file) + root = tree.getroot() + + ns = {"mw": "http://www.mediawiki.org/xml/export-0.11/"} + ET.register_namespace("", ns["mw"]) + + for page in root.findall("mw:page", ns): + title = page.find("mw:title", ns).text + + if title.startswith("User:") or 
title.startswith("Talk:"): + root.remove(page) + continue + + revisions = page.findall("mw:revision", ns) + + if len(revisions) > 1: + latest_revision = max( + revisions, key=lambda rev: rev.find("mw:timestamp", ns).text + ) + + # Remove all revisions except the latest one + for revision in revisions: + if revision != latest_revision: + page.remove(revision) + + tree.write(out_file, encoding="utf-8", xml_declaration=False) + + +def badlinks_print(known_file, outfile): + with open(known_file, "r") as infile, open(outfile, "w") as of: + for line in infile: + stripped_line = line.strip() + if stripped_line and not stripped_line.startswith("#"): + of.write(f"--exclude {stripped_line} ") + + +def dump_link_map(jsonfile, dumpfile): + with open(jsonfile, "r") as json_file: + fail_data = json.load(json_file) + + with open(dumpfile, mode="w", newline="", encoding="utf-8") as csv_file: + csv_writer = csv.writer(csv_file, delimiter="\t", quotechar='"') + csv_writer.writerow(["STATUS", "URL", "WIKIURL"]) + + for xml_file, failed_url_entries in fail_data["fail_map"].items(): + with open(xml_file, "r", encoding="utf-8") as xmlf: + root = ET.fromstring(f"{xmlf.read()}") + + for doc in root.findall("doc"): + title = doc.attrib.get("title") + title = re.sub(r"\s+", "_", title) + content = doc.text + + for entry in failed_url_entries: + url = entry["url"] + status = entry.get("status", {}).get("code", 403) + if url in content: + csv_writer.writerow( + [ + status, + url, + f"https://wiki.nixos.org/wiki/{title}", + ] + ) + + +def print_usage(status=0): + print( + """ +Usage: python main.py [action] + [action] what? + —————————————————————————————————————————————————————————— + filter dumpxmlfile outxmlfile filter out unncesscary pages from dump + badlinks badlinksfile outfile parse and print known allowed.links + dumplinkmap jsonfile outfilelinkmap dumps a map of url and nixos article where it is present + help prints this help message and exits +""" + ) + sys.exit(status) + + +if __name__ == "__main__": + if len(sys.argv) < 2: + print_usage(1) + action = sys.argv[1] + if action in "filter|badlinks|dumplinkmap": + if len(sys.argv) != 4: + print_usage(1) + if action == "filter": + dump_file = sys.argv[2] + out_file = sys.argv[3] + process_dump(dump_file, out_file) + elif action == "badlinks": + known_file = sys.argv[2] + out_file = sys.argv[3] + badlinks_print(known_file, out_file) + elif action == "dumplinkmap": + jsonfile = sys.argv[2] + dumpfile = sys.argv[3] + dump_link_map(jsonfile, dumpfile) + elif action in "--help": + print_usage(0) + else: + print_usage(1) diff --git a/checks/linkcheck/pkgs/default.nix b/checks/linkcheck/pkgs/default.nix new file mode 100644 index 0000000..ec85a5c --- /dev/null +++ b/checks/linkcheck/pkgs/default.nix @@ -0,0 +1,4 @@ +{ pkgs ? 
import { } }: +{ + wikiextractor = pkgs.callPackage ./wikiextractor.nix { }; +} diff --git a/checks/linkcheck/pkgs/wikiextractor.nix b/checks/linkcheck/pkgs/wikiextractor.nix new file mode 100644 index 0000000..ad58c0b --- /dev/null +++ b/checks/linkcheck/pkgs/wikiextractor.nix @@ -0,0 +1,44 @@ +{ lib +, python3 +, fetchpatch +, fetchFromGitHub +, +}: + +python3.pkgs.buildPythonApplication rec { + pname = "wikiextractor"; + version = "3.0.7"; + pyproject = true; + + src = fetchFromGitHub { + owner = "attardi"; + repo = "wikiextractor"; + rev = "v${version}"; + hash = "sha256-QeBC6ACHGKCSegd+wnOyIZI93L+f1EU62sFE0sAEwhU="; + }; + + build-system = [ + python3.pkgs.setuptools + python3.pkgs.wheel + ]; + + pythonImportsCheck = [ + "wikiextractor" + ]; + + patches = [ + # https://github.com/attardi/wikiextractor/issues/336#issuecomment-2322886454 + (fetchpatch { + url = "https://github.com/attardi/wikiextractor/commit/ab8988ebfa9e4557411f3d4c0f4ccda139e18875.patch"; + hash = "sha256-K1N6BA3FLieBTMIg9fyavc9ZajAr0vs754Nox53htmY="; + }) + ]; + + meta = { + description = "A tool for extracting plain text from Wikipedia dumps"; + homepage = "https://github.com/attardi/wikiextractor"; + license = lib.licenses.agpl3Only; + maintainers = with lib.maintainers; [ phanirithvij ]; + mainProgram = "wikiextractor"; + }; +} diff --git a/formatter.nix b/formatter.nix index 940a98b..5900b5e 100644 --- a/formatter.nix +++ b/formatter.nix @@ -29,32 +29,42 @@ ]; programs.shellcheck.enable = true; programs.deno.enable = true; + programs.black.enable = true; }; - packages.default = pkgs.mkShell { - packages = - let - convert2Tofu = - provider: - provider.override (prev: { - homepage = builtins.replaceStrings [ "registry.terraform.io/providers" ] [ - "registry.opentofu.org" + packages = { + default = pkgs.mkShell { + packages = + let + convert2Tofu = + provider: + provider.override (prev: { + homepage = builtins.replaceStrings [ "registry.terraform.io/providers" ] [ + "registry.opentofu.org" + ] + prev.homepage; + }); + in + [ + pkgs.bashInteractive + pkgs.sops + (pkgs.opentofu.withPlugins ( + p: + builtins.map convert2Tofu [ + p.hcloud + p.null + p.external + p.local ] - prev.homepage; - }); - in - [ - pkgs.bashInteractive - pkgs.sops - (pkgs.opentofu.withPlugins ( - p: - builtins.map convert2Tofu [ - p.hcloud - p.null - p.external - p.local - ] - )) - ]; + )) + ]; + }; + } + // (import ./checks/linkcheck/pkgs { inherit pkgs; }); + devShells.linkcheck = pkgs.mkShell { + packages = [ + pkgs.lychee + (pkgs.python3.withPackages (pypkgs: [ pypkgs.lxml ])) + ]; }; }; } From 4e0b102f88793b0976ca0064588cdc1f63e47239 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rg=20Thalheim?= Date: Tue, 17 Sep 2024 09:53:41 +0200 Subject: [PATCH 03/15] linkcheck: replace wget/7z with curl/zstd --- checks/linkcheck/lychee.sh | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/checks/linkcheck/lychee.sh b/checks/linkcheck/lychee.sh index eab8300..e96ae16 100755 --- a/checks/linkcheck/lychee.sh +++ b/checks/linkcheck/lychee.sh @@ -1,5 +1,5 @@ #! /usr/bin/env nix-shell -#! nix-shell -i bash -p wget p7zip bash findutils gnused coreutils lychee +#! 
nix-shell -i bash -p curl zstd bash findutils gnused coreutils lychee # shellcheck shell=bash SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd) @@ -8,8 +8,7 @@ workdir="$SCRIPT_DIR/workdir" mkdir -p "$workdir" pushd "$workdir" || exit -wget -O wikidump.xml.zst "https://wiki.nixos.org/wikidump.xml.zst" -7z x -aoa wikidump.xml.zst +curl "https://wiki.nixos.org/wikidump.xml.zst" | zstd -d >wikidump.xml # filter unimportant pages like User:* Talk:* python ../main.py filter wikidump.xml wikidump-filtered.xml From 81746a3ea83d81d30028c40bf13b16dc7619fa6d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rg=20Thalheim?= Date: Tue, 17 Sep 2024 09:54:16 +0200 Subject: [PATCH 04/15] add missing python3 to nix shebang --- checks/linkcheck/lychee.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/checks/linkcheck/lychee.sh b/checks/linkcheck/lychee.sh index e96ae16..6c63cd3 100755 --- a/checks/linkcheck/lychee.sh +++ b/checks/linkcheck/lychee.sh @@ -1,5 +1,5 @@ #! /usr/bin/env nix-shell -#! nix-shell -i bash -p curl zstd bash findutils gnused coreutils lychee +#! nix-shell -i bash -p python3 curl zstd bash findutils gnused coreutils lychee # shellcheck shell=bash SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd) @@ -11,10 +11,10 @@ pushd "$workdir" || exit curl "https://wiki.nixos.org/wikidump.xml.zst" | zstd -d >wikidump.xml # filter unimportant pages like User:* Talk:* -python ../main.py filter wikidump.xml wikidump-filtered.xml +python3 ../main.py filter wikidump.xml wikidump-filtered.xml # generate exclude args from allowlist -python ../main.py badlinks ../allowed.links exclude-args +python3 ../main.py badlinks ../allowed.links exclude-args # exlude sending requests to the wiki echo "--exclude wiki.nixos.org/wiki" >>exclude-args @@ -79,7 +79,7 @@ timeout 30 lychee -E \ tee lychee-wayback.log # csv of status, url, corresponding wiki page link -python ../main.py dumplinkmap lychee.json failed-wiki-links.csv +python3 ../main.py dumplinkmap lychee.json failed-wiki-links.csv # sort for consistency { From 4f79bc4c707e17692874169677e502d7f2db39a8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rg=20Thalheim?= Date: Tue, 17 Sep 2024 09:56:49 +0200 Subject: [PATCH 05/15] linkcheck: add type annotations --- checks/linkcheck/main.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/checks/linkcheck/main.py b/checks/linkcheck/main.py index b8128b0..8dc5270 100644 --- a/checks/linkcheck/main.py +++ b/checks/linkcheck/main.py @@ -3,10 +3,11 @@ import csv import re import sys import xml.etree.ElementTree as ET +from typing import NoReturn # filter out unimportant pages like Talk:, User:, and old revisions of posts -def process_dump(dump_file, out_file): +def process_dump(dump_file: str, out_file: str) -> None: tree = ET.parse(dump_file) root = tree.getroot() @@ -35,7 +36,7 @@ def process_dump(dump_file, out_file): tree.write(out_file, encoding="utf-8", xml_declaration=False) -def badlinks_print(known_file, outfile): +def badlinks_print(known_file: str, outfile: str) -> None: with open(known_file, "r") as infile, open(outfile, "w") as of: for line in infile: stripped_line = line.strip() @@ -43,7 +44,7 @@ def badlinks_print(known_file, outfile): of.write(f"--exclude {stripped_line} ") -def dump_link_map(jsonfile, dumpfile): +def dump_link_map(jsonfile: str, dumpfile: str) -> None: with open(jsonfile, "r") as json_file: fail_data = json.load(json_file) @@ -73,7 +74,7 @@ def dump_link_map(jsonfile, dumpfile): ) -def 
print_usage(status=0): +def print_usage(status: int = 0) -> NoReturn: print( """ Usage: python main.py [action] From 2d5336d3edf015d4bd90567b4bfaf9a3cd11a1ab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rg=20Thalheim?= Date: Tue, 17 Sep 2024 10:02:39 +0200 Subject: [PATCH 06/15] linkcheck: make xml parsing more robust and fix types --- checks/linkcheck/main.py | 36 ++++++++++++++++++++++++++++++++++-- 1 file changed, 34 insertions(+), 2 deletions(-) diff --git a/checks/linkcheck/main.py b/checks/linkcheck/main.py index 8dc5270..4d00f78 100644 --- a/checks/linkcheck/main.py +++ b/checks/linkcheck/main.py @@ -6,6 +6,23 @@ import xml.etree.ElementTree as ET from typing import NoReturn +def get_revision_timestamp(revision: ET.Element, ns: dict[str, str]) -> str: + timestamp = revision.find("mw:timestamp", ns) + if timestamp is None: + print( + f"Timestamp tag not found in revision: {ET.tostring(revision)}", + file=sys.stderr, + ) + return "" + if timestamp.text is None: + print( + f"Timestamp text doesn't exist in revision: {ET.tostring(revision)}", + file=sys.stderr, + ) + return "" + return timestamp.text + + # filter out unimportant pages like Talk:, User:, and old revisions of posts def process_dump(dump_file: str, out_file: str) -> None: tree = ET.parse(dump_file) @@ -15,7 +32,17 @@ def process_dump(dump_file: str, out_file: str) -> None: ET.register_namespace("", ns["mw"]) for page in root.findall("mw:page", ns): - title = page.find("mw:title", ns).text + title_tag = page.find("mw:title", ns) + if title_tag is None: + print(f"Title tag not found in page: {ET.tostring(page)}", file=sys.stderr) + continue + title = title_tag.text + if title is None: + print( + f"Title text doesn't exist in page: {ET.tostring(page)}", + file=sys.stderr, + ) + continue if title.startswith("User:") or title.startswith("Talk:"): root.remove(page) @@ -25,7 +52,7 @@ def process_dump(dump_file: str, out_file: str) -> None: if len(revisions) > 1: latest_revision = max( - revisions, key=lambda rev: rev.find("mw:timestamp", ns).text + revisions, key=lambda revison: get_revision_timestamp(revison, ns) ) # Remove all revisions except the latest one @@ -58,6 +85,11 @@ def dump_link_map(jsonfile: str, dumpfile: str) -> None: for doc in root.findall("doc"): title = doc.attrib.get("title") + if title is None: + print( + f"Title not found in doc: {ET.tostring(doc)}", file=sys.stderr + ) + continue title = re.sub(r"\s+", "_", title) content = doc.text From f66c272fc2f78c62bd123b17703cb4fbc30c897f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rg=20Thalheim?= Date: Tue, 17 Sep 2024 10:03:56 +0200 Subject: [PATCH 07/15] linkcheck: move argparsing to main function --- checks/linkcheck/main.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/checks/linkcheck/main.py b/checks/linkcheck/main.py index 4d00f78..a7227c6 100644 --- a/checks/linkcheck/main.py +++ b/checks/linkcheck/main.py @@ -121,7 +121,7 @@ Usage: python main.py [action] sys.exit(status) -if __name__ == "__main__": +def main() -> None: if len(sys.argv) < 2: print_usage(1) action = sys.argv[1] @@ -144,3 +144,7 @@ if __name__ == "__main__": print_usage(0) else: print_usage(1) + + +if __name__ == "__main__": + main() From 7d16671ce2eba0277d71df62bceeb0440dd8942d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rg=20Thalheim?= Date: Tue, 17 Sep 2024 10:07:55 +0200 Subject: [PATCH 08/15] linkcheck: add argparse --- checks/linkcheck/main.py | 80 +++++++++++++++++----------------------- 1 file changed, 34 insertions(+), 46 deletions(-) diff --git 
a/checks/linkcheck/main.py b/checks/linkcheck/main.py index a7227c6..c949bce 100644 --- a/checks/linkcheck/main.py +++ b/checks/linkcheck/main.py @@ -2,8 +2,8 @@ import json import csv import re import sys +import argparse import xml.etree.ElementTree as ET -from typing import NoReturn def get_revision_timestamp(revision: ET.Element, ns: dict[str, str]) -> str: @@ -24,8 +24,8 @@ def get_revision_timestamp(revision: ET.Element, ns: dict[str, str]) -> str: # filter out unimportant pages like Talk:, User:, and old revisions of posts -def process_dump(dump_file: str, out_file: str) -> None: - tree = ET.parse(dump_file) +def process_dump( args: argparse.Namespace) -> None: + tree = ET.parse(args.dump_file) root = tree.getroot() ns = {"mw": "http://www.mediawiki.org/xml/export-0.11/"} @@ -60,22 +60,23 @@ def process_dump(dump_file: str, out_file: str) -> None: if revision != latest_revision: page.remove(revision) - tree.write(out_file, encoding="utf-8", xml_declaration=False) + tree.write(args.out_file, encoding="utf-8", xml_declaration=False) -def badlinks_print(known_file: str, outfile: str) -> None: - with open(known_file, "r") as infile, open(outfile, "w") as of: +def badlinks_print(args: argparse.Namespace) -> None: + # known_file: str, outfile: str) -> None: + with open(args.known_file, "r") as infile, open(args.outfile, "w") as of: for line in infile: stripped_line = line.strip() if stripped_line and not stripped_line.startswith("#"): of.write(f"--exclude {stripped_line} ") -def dump_link_map(jsonfile: str, dumpfile: str) -> None: - with open(jsonfile, "r") as json_file: +def dump_link_map(args: argparse.Namespace) -> None: + with open(args.jsonfile, "r") as json_file: fail_data = json.load(json_file) - with open(dumpfile, mode="w", newline="", encoding="utf-8") as csv_file: + with open(args.dumpfile, mode="w", newline="", encoding="utf-8") as csv_file: csv_writer = csv.writer(csv_file, delimiter="\t", quotechar='"') csv_writer.writerow(["STATUS", "URL", "WIKIURL"]) @@ -106,44 +107,31 @@ def dump_link_map(jsonfile: str, dumpfile: str) -> None: ) -def print_usage(status: int = 0) -> NoReturn: - print( - """ -Usage: python main.py [action] - [action] what? 
- —————————————————————————————————————————————————————————— - filter dumpxmlfile outxmlfile filter out unncesscary pages from dump - badlinks badlinksfile outfile parse and print known allowed.links - dumplinkmap jsonfile outfilelinkmap dumps a map of url and nixos article where it is present - help prints this help message and exits -""" - ) - sys.exit(status) - - def main() -> None: - if len(sys.argv) < 2: - print_usage(1) - action = sys.argv[1] - if action in "filter|badlinks|dumplinkmap": - if len(sys.argv) != 4: - print_usage(1) - if action == "filter": - dump_file = sys.argv[2] - out_file = sys.argv[3] - process_dump(dump_file, out_file) - elif action == "badlinks": - known_file = sys.argv[2] - out_file = sys.argv[3] - badlinks_print(known_file, out_file) - elif action == "dumplinkmap": - jsonfile = sys.argv[2] - dumpfile = sys.argv[3] - dump_link_map(jsonfile, dumpfile) - elif action in "--help": - print_usage(0) - else: - print_usage(1) + parser = argparse.ArgumentParser(description="Process wiki dump files") + subparsers = parser.add_subparsers() + parser_filter = subparsers.add_parser("filter", help="Filter out unimportant pages") + + parser_filter.add_argument("dump_file", type=str) + parser_filter.add_argument("out_file", type=str) + parser_filter.set_defaults(func=process_dump) + + parser_badlinks = subparsers.add_parser( + "badlinks", help="Parse and print known allowed links" + ) + parser_badlinks.add_argument("known_file", type=str) + parser_badlinks.add_argument("out_file", type=str) + parser_badlinks.set_defaults(func=badlinks_print) + + parser_dumplinkmap = subparsers.add_parser( + "dumplinkmap", help="Dump a map of url and nixos article where it is present" + ) + parser_dumplinkmap.add_argument("jsonfile", type=str) + parser_dumplinkmap.add_argument("dumpfile", type=str) + parser_dumplinkmap.set_defaults(func=dump_link_map) + + args = parser.parse_args() + args.func(args) if __name__ == "__main__": From 197dc548645ede3715b139a74c63718e0f05da79 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rg=20Thalheim?= Date: Tue, 17 Sep 2024 10:12:52 +0200 Subject: [PATCH 09/15] linkcheck: use pathlib --- checks/linkcheck/main.py | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/checks/linkcheck/main.py b/checks/linkcheck/main.py index c949bce..5878a4b 100644 --- a/checks/linkcheck/main.py +++ b/checks/linkcheck/main.py @@ -4,6 +4,7 @@ import re import sys import argparse import xml.etree.ElementTree as ET +from pathlib import Path def get_revision_timestamp(revision: ET.Element, ns: dict[str, str]) -> str: @@ -24,8 +25,8 @@ def get_revision_timestamp(revision: ET.Element, ns: dict[str, str]) -> str: # filter out unimportant pages like Talk:, User:, and old revisions of posts -def process_dump( args: argparse.Namespace) -> None: - tree = ET.parse(args.dump_file) +def process_dump(args: argparse.Namespace) -> None: + tree = ET.parse(str(args.dump_file)) root = tree.getroot() ns = {"mw": "http://www.mediawiki.org/xml/export-0.11/"} @@ -60,12 +61,11 @@ def process_dump( args: argparse.Namespace) -> None: if revision != latest_revision: page.remove(revision) - tree.write(args.out_file, encoding="utf-8", xml_declaration=False) + tree.write(str(args.out_file), encoding="utf-8", xml_declaration=False) def badlinks_print(args: argparse.Namespace) -> None: - # known_file: str, outfile: str) -> None: - with open(args.known_file, "r") as infile, open(args.outfile, "w") as of: + with args.known_file.open() as infile, args.out_file.open("w") as of: 
for line in infile: stripped_line = line.strip() if stripped_line and not stripped_line.startswith("#"): @@ -73,10 +73,9 @@ def badlinks_print(args: argparse.Namespace) -> None: def dump_link_map(args: argparse.Namespace) -> None: - with open(args.jsonfile, "r") as json_file: - fail_data = json.load(json_file) + fail_data = json.loads(args.json_file.read_text()) - with open(args.dumpfile, mode="w", newline="", encoding="utf-8") as csv_file: + with args.dump_file.open(mode="w", newline="", encoding="utf-8") as csv_file: csv_writer = csv.writer(csv_file, delimiter="\t", quotechar='"') csv_writer.writerow(["STATUS", "URL", "WIKIURL"]) @@ -112,22 +111,22 @@ def main() -> None: subparsers = parser.add_subparsers() parser_filter = subparsers.add_parser("filter", help="Filter out unimportant pages") - parser_filter.add_argument("dump_file", type=str) - parser_filter.add_argument("out_file", type=str) + parser_filter.add_argument("dump_file", type=Path) + parser_filter.add_argument("out_file", type=Path) parser_filter.set_defaults(func=process_dump) parser_badlinks = subparsers.add_parser( "badlinks", help="Parse and print known allowed links" ) - parser_badlinks.add_argument("known_file", type=str) - parser_badlinks.add_argument("out_file", type=str) + parser_badlinks.add_argument("known_file", type=Path) + parser_badlinks.add_argument("out_file", type=Path) parser_badlinks.set_defaults(func=badlinks_print) parser_dumplinkmap = subparsers.add_parser( "dumplinkmap", help="Dump a map of url and nixos article where it is present" ) - parser_dumplinkmap.add_argument("jsonfile", type=str) - parser_dumplinkmap.add_argument("dumpfile", type=str) + parser_dumplinkmap.add_argument("json_file", type=Path) + parser_dumplinkmap.add_argument("dump_file", type=Path) parser_dumplinkmap.set_defaults(func=dump_link_map) args = parser.parse_args() From 94429be77fb2ca7ddeab6ae0ab148124684845a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rg=20Thalheim?= Date: Tue, 17 Sep 2024 10:25:16 +0200 Subject: [PATCH 10/15] linkcheck: allow to re-run report --- checks/linkcheck/lychee.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/checks/linkcheck/lychee.sh b/checks/linkcheck/lychee.sh index 6c63cd3..fea01ae 100755 --- a/checks/linkcheck/lychee.sh +++ b/checks/linkcheck/lychee.sh @@ -91,7 +91,7 @@ mv sorted_filename.tsv failed-wiki-links.csv cat failed-wiki-links.csv dest="../lychee-$(printf '%(%Y-%m-%d)T\n')-report" -mkdir "$dest" +mkdir -p "$dest" cp ../allowed.links lychee*.log failed-wiki-links.csv "$dest" popd || exit From ff19131911dd30e20672c733aadd3d5516b68b46 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rg=20Thalheim?= Date: Tue, 17 Sep 2024 10:27:21 +0200 Subject: [PATCH 11/15] linkcheck: add lxml to shebang --- checks/linkcheck/lychee.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/checks/linkcheck/lychee.sh b/checks/linkcheck/lychee.sh index fea01ae..ac607e4 100755 --- a/checks/linkcheck/lychee.sh +++ b/checks/linkcheck/lychee.sh @@ -1,5 +1,5 @@ #! /usr/bin/env nix-shell -#! nix-shell -i bash -p python3 curl zstd bash findutils gnused coreutils lychee +#! 
nix-shell -i bash -p 'python3.withPackages (ps: with ps; [ lxml ])' curl zstd bash findutils gnused coreutils lychee # shellcheck shell=bash SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd) From 2be4de6fc69d0f7ecd6bc52c040c283c44dccb64 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rg=20Thalheim?= Date: Tue, 17 Sep 2024 10:28:10 +0200 Subject: [PATCH 12/15] linkcheck: simplify instructions --- checks/linkcheck/README.md | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/checks/linkcheck/README.md b/checks/linkcheck/README.md index 4a1cb35..27cde74 100644 --- a/checks/linkcheck/README.md +++ b/checks/linkcheck/README.md @@ -21,13 +21,10 @@ Dead links if detected early have a chance to prevent linkrot. ## Instructions ```shell -cd ./checks/linkcheck -direnv allow # or # nix develop ..#linkcheck -./lychee.sh +./checks/linkcheck/lychee.sh ``` -It can be run from anywhere so `/path/to/checks/linkcheck/lychee.sh` works but -the report will be generated at `/path/to/checks/linkcheck`. +The report will be generated at `/path/to/checks/linkcheck`. As usual, `nix fmt` works inside linkcheck dir. From 7c9a68ff76fd54b54f03277554439ee1da86a1e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rg=20Thalheim?= Date: Tue, 17 Sep 2024 10:37:30 +0200 Subject: [PATCH 13/15] linkcheck: sort csv in python --- checks/linkcheck/lychee.sh | 7 ----- checks/linkcheck/main.py | 58 +++++++++++++++++++++----------------- 2 files changed, 32 insertions(+), 33 deletions(-) diff --git a/checks/linkcheck/lychee.sh b/checks/linkcheck/lychee.sh index ac607e4..8d3aba0 100755 --- a/checks/linkcheck/lychee.sh +++ b/checks/linkcheck/lychee.sh @@ -81,13 +81,6 @@ timeout 30 lychee -E \ # csv of status, url, corresponding wiki page link python3 ../main.py dumplinkmap lychee.json failed-wiki-links.csv -# sort for consistency -{ - head -n 1 failed-wiki-links.csv - tail -n +2 failed-wiki-links.csv | sort -t$'\t' -k2,2 -} >sorted_filename.tsv -mv sorted_filename.tsv failed-wiki-links.csv - cat failed-wiki-links.csv dest="../lychee-$(printf '%(%Y-%m-%d)T\n')-report" diff --git a/checks/linkcheck/main.py b/checks/linkcheck/main.py index 5878a4b..02d0f64 100644 --- a/checks/linkcheck/main.py +++ b/checks/linkcheck/main.py @@ -3,6 +3,7 @@ import csv import re import sys import argparse +import bisect import xml.etree.ElementTree as ET from pathlib import Path @@ -72,38 +73,43 @@ def badlinks_print(args: argparse.Namespace) -> None: of.write(f"--exclude {stripped_line} ") +def read_lychee_file(lychee_file: Path) -> list[list[str]]: + fail_data = json.loads(lychee_file.read_text()) + failed_urls = [] + for xml_file, failed_url_entries in fail_data["fail_map"].items(): + with open(xml_file, "r", encoding="utf-8") as xmlf: + root = ET.fromstring(f"{xmlf.read()}") + for doc in root.findall("doc"): + title = doc.attrib.get("title") + if title is None: + print(f"Title not found in doc: {ET.tostring(doc)}", file=sys.stderr) + continue + title = re.sub(r"\s+", "_", title) + content = doc.text + for entry in failed_url_entries: + url = entry["url"] + status = entry.get("status", {}).get("code", 403) + if url in content: + bisect.insort( + failed_urls, + [ + status, + url, + f"https://wiki.nixos.org/wiki/{title}", + ], + ) + return failed_urls + + def dump_link_map(args: argparse.Namespace) -> None: - fail_data = json.loads(args.json_file.read_text()) + failed_urls = read_lychee_file(args.json_file) with args.dump_file.open(mode="w", newline="", encoding="utf-8") as csv_file: csv_writer = 
csv.writer(csv_file, delimiter="\t", quotechar='"') csv_writer.writerow(["STATUS", "URL", "WIKIURL"]) - for xml_file, failed_url_entries in fail_data["fail_map"].items(): - with open(xml_file, "r", encoding="utf-8") as xmlf: - root = ET.fromstring(f"{xmlf.read()}") - - for doc in root.findall("doc"): - title = doc.attrib.get("title") - if title is None: - print( - f"Title not found in doc: {ET.tostring(doc)}", file=sys.stderr - ) - continue - title = re.sub(r"\s+", "_", title) - content = doc.text - - for entry in failed_url_entries: - url = entry["url"] - status = entry.get("status", {}).get("code", 403) - if url in content: - csv_writer.writerow( - [ - status, - url, - f"https://wiki.nixos.org/wiki/{title}", - ] - ) + for item in failed_urls: + csv_writer.writerow(item) def main() -> None: From 91b55c69428bfe8e906bff6f087ac48ff4cddf52 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rg=20Thalheim?= Date: Tue, 17 Sep 2024 10:54:05 +0200 Subject: [PATCH 14/15] linkcheck: use array to pass args --- checks/linkcheck/lychee.sh | 30 +++++++++++++----------------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/checks/linkcheck/lychee.sh b/checks/linkcheck/lychee.sh index 8d3aba0..9fca97b 100755 --- a/checks/linkcheck/lychee.sh +++ b/checks/linkcheck/lychee.sh @@ -16,19 +16,19 @@ python3 ../main.py filter wikidump.xml wikidump-filtered.xml # generate exclude args from allowlist python3 ../main.py badlinks ../allowed.links exclude-args -# exlude sending requests to the wiki -echo "--exclude wiki.nixos.org/wiki" >>exclude-args -extrargs="$extrargs --exclude wiki.nixos.org/wiki" -excludeargs=$(cat exclude-args) +extrargs=( + # exlude sending requests to the wiki + "--exclude" "wiki.nixos.org/wiki" + # default is too high + "--max-concurrency" "16" +) +read -r -a excludeargs <<<"$(lychee.json # get archive suggestions # --timeout not working with --suggest see https://github.com/lycheeverse/lychee/issues/1501 # TODO remove timeout command later after the issue is fixed -# shellcheck disable=SC2086 timeout 30 lychee -E \ --cache --scheme http --scheme https \ - --include-verbatim $excludeargs $extrargs \ + --include-verbatim "${excludeargs[@]}" "${extrargs[@]}" \ --suggest \ text | tee lychee-wayback.log From 01b85c1ee5fdb67b1aa4ab344ac3d15d2f834144 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rg=20Thalheim?= Date: Tue, 17 Sep 2024 10:55:17 +0200 Subject: [PATCH 15/15] drop black as formatter since we already use ruff --- formatter.nix | 1 - 1 file changed, 1 deletion(-) diff --git a/formatter.nix b/formatter.nix index 5900b5e..319e4e0 100644 --- a/formatter.nix +++ b/formatter.nix @@ -29,7 +29,6 @@ ]; programs.shellcheck.enable = true; programs.deno.enable = true; - programs.black.enable = true; }; packages = { default = pkgs.mkShell {