From 0afcb722676807b47e6e5081d5e9cf23da57436a Mon Sep 17 00:00:00 2001
From: phanirithvij
Date: Mon, 16 Sep 2024 13:29:07 +0530
Subject: [PATCH] ci: add broken link checker action

Signed-off-by: phanirithvij
---
 .github/workflows/check-dead-links.yml  |  30 +++++++
 checks/linkcheck/.envrc                 |   1 +
 checks/linkcheck/.gitignore             |   5 ++
 checks/linkcheck/README.md              |  38 ++++++++
 checks/linkcheck/allowed.links          |  27 ++++++
 checks/linkcheck/lychee.sh              |  99 +++++++++++++++++++++++
 checks/linkcheck/main.py                | 113 ++++++++++++++++++++++++
 checks/linkcheck/pkgs/default.nix       |   4 +
 checks/linkcheck/pkgs/wikiextractor.nix |  44 +++++++++
 formatter.nix                           |  58 +++++++-----
 10 files changed, 395 insertions(+), 24 deletions(-)
 create mode 100644 .github/workflows/check-dead-links.yml
 create mode 100644 checks/linkcheck/.envrc
 create mode 100644 checks/linkcheck/.gitignore
 create mode 100644 checks/linkcheck/README.md
 create mode 100644 checks/linkcheck/allowed.links
 create mode 100755 checks/linkcheck/lychee.sh
 create mode 100644 checks/linkcheck/main.py
 create mode 100644 checks/linkcheck/pkgs/default.nix
 create mode 100644 checks/linkcheck/pkgs/wikiextractor.nix

diff --git a/.github/workflows/check-dead-links.yml b/.github/workflows/check-dead-links.yml
new file mode 100644
index 0000000..7acb455
--- /dev/null
+++ b/.github/workflows/check-dead-links.yml
@@ -0,0 +1,30 @@
+name: "check broken links in wiki"
+on:
+  workflow_dispatch:
+  # twice per month
+  schedule:
+    - cron: "0 14 1,15 * *"
+jobs:
+  lychee:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: cachix/install-nix-action@v27
+        with:
+          nix_path: nixpkgs=channel:nixos-unstable
+      - run: ./checks/linkcheck/lychee.sh
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+      - name: upload lychee report as artifact
+        uses: actions/upload-artifact@v4
+        with:
+          name: lychee-report
+          if-no-files-found: error
+          path: ./checks/linkcheck/lychee*-report
+      - name: upload filtered xmldump artifact
+        uses: actions/upload-artifact@v4
+        with:
+          name: wikidump-filtered.xml
+          if-no-files-found: error
+          path: ./checks/linkcheck/workdir/wikidump-filtered.xml
+          compression-level: 9
diff --git a/checks/linkcheck/.envrc b/checks/linkcheck/.envrc
new file mode 100644
index 0000000..48cb98d
--- /dev/null
+++ b/checks/linkcheck/.envrc
@@ -0,0 +1 @@
+use flake .#linkcheck
diff --git a/checks/linkcheck/.gitignore b/checks/linkcheck/.gitignore
new file mode 100644
index 0000000..6f6c2b4
--- /dev/null
+++ b/checks/linkcheck/.gitignore
@@ -0,0 +1,5 @@
+temp
+.direnv
+*-report
+result*
+workdir
diff --git a/checks/linkcheck/README.md b/checks/linkcheck/README.md
new file mode 100644
index 0000000..4a1cb35
--- /dev/null
+++ b/checks/linkcheck/README.md
@@ -0,0 +1,38 @@
+## What
+
+Dead link checker for wiki.nixos.org, a GitHub Action powered by
+[lychee](https://github.com/lycheeverse/lychee).
+
+Runs twice a month; the schedule can be adjusted in the GitHub Action cron job.
+Broken links must be fixed manually in the wiki; nothing is automated.
+
+The initial run gave ~100 results, which were fixed manually; see the entries
+before 16 Sep
+[here](https://wiki.nixos.org/w/index.php?title=Special:Contributions/Phanirithvij&target=Phanirithvij&offset=&limit=100).
+
+## Why
+
+Dead links, if detected early, can be fixed before link rot sets in.
+
+- Why not use an existing broken-link-checker GitHub Action?
+  - this script is not tied to GitHub Actions, so it also works locally
+  - the GitHub Action only calls the script and uploads the report artifacts
+
+## Instructions
+
+```shell
+cd ./checks/linkcheck
+direnv allow # or: nix develop ..#linkcheck
+./lychee.sh
+```
+
+It can be run from anywhere, so `/path/to/checks/linkcheck/lychee.sh` works, but
+the report is always generated in `/path/to/checks/linkcheck`.
+
+As usual, `nix fmt` works inside the linkcheck directory.
+
+## TODO/Roadmap
+
+- [ ] archive all links found in the lychee scan (see lychee --dump)
+  - Since these links are prone to deletion, it is our duty to archive them.
+  - There was a CLI tool for this; forgot what it is, rediscover it.
diff --git a/checks/linkcheck/allowed.links b/checks/linkcheck/allowed.links
new file mode 100644
index 0000000..bb1c731
--- /dev/null
+++ b/checks/linkcheck/allowed.links
@@ -0,0 +1,27 @@
+# an allowlist of known-bad link patterns (regexes)
+# each line can be a comment, a regex, or a full/partial url
+# comments will be stripped out
+# urls must be urlencoded
+# each remaining line is stitched into an --exclude arg and passed to lychee
+
+(http://(server|unix|hydra)|https://(cache|relay-server)/)
+
+# %7B is {; matches urls inside nix expressions like ${version} etc.
+(.*\.(domain|tld|mydomain|local).*|my.app|%7B)
+
+(qemu/1.0|locating-rules/1.0)
+
+# no need to fetch cache entries
+(cache.nixos.org|fzakaria.cachix.org)
+
+# urls in example snippets
+(USERNAME/nixpkgs|your_username/nixpkgs|fooUser/barRepo|code.visualstudio.com/sha|path/to/patch)
+
+# works in browser
+https://www.phoronix.com/news/Mesa-Delete-Clover-Discussion
+
+# works with git clone
+https://review.coreboot.org/coreboot.git
+
+# works in browser
+https://pypi.org/project/stt/#files
diff --git a/checks/linkcheck/lychee.sh b/checks/linkcheck/lychee.sh
new file mode 100755
index 0000000..eab8300
--- /dev/null
+++ b/checks/linkcheck/lychee.sh
@@ -0,0 +1,99 @@
+#! /usr/bin/env nix-shell
+#! nix-shell -i bash -p wget p7zip bash findutils gnused coreutils lychee
+# shellcheck shell=bash
+
+SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)
+
+workdir="$SCRIPT_DIR/workdir"
+mkdir -p "$workdir"
+pushd "$workdir" || exit
+
+wget -O wikidump.xml.zst "https://wiki.nixos.org/wikidump.xml.zst"
+7z x -aoa wikidump.xml.zst
+
+# filter out unimportant pages like User:* and Talk:*
+python ../main.py filter wikidump.xml wikidump-filtered.xml
+
+# generate exclude args from the allowlist
+python ../main.py badlinks ../allowed.links exclude-args
+
+# exclude sending requests to the wiki itself
+echo "--exclude wiki.nixos.org/wiki" >>exclude-args
+extrargs="$extrargs --exclude wiki.nixos.org/wiki"
+excludeargs=$(cat exclude-args)
+
+# extract only the text from the filtered xml dump
+nix --extra-experimental-features "nix-command flakes" run ..#wikiextractor wikidump-filtered.xml
+
+# lychee requires .md or .html files to parse
+find text -type f | grep -v .html | xargs -I{} mv {} "{}.html"
+
+# the default concurrency is too high
+extrargs="$extrargs --max-concurrency 16"
+
+# GITHUB_TOKEN from env or fallback to gh (local dev)
+if [ -z "${GITHUB_TOKEN}" ]; then
+  if command -v gh &>/dev/null; then
+    echo using gh auth token
+    GITHUB_TOKEN=$(gh auth token)
+  fi
+fi
+
+if [ -n "${GITHUB_TOKEN}" ]; then
+  echo using github token
+  extrargs="$extrargs --github-token $GITHUB_TOKEN"
+fi
+
+# shellcheck disable=SC2086
+# check links, skipping the allowlisted ones
+lychee -E \
+  --cache --scheme http --scheme https \
+  --include-verbatim $excludeargs $extrargs \
+  text |
+  tee lychee.log
+
+# shellcheck disable=SC2086
+# check all links, ignoring the allowlist (allowed.links)
+lychee -E \
+  --cache --scheme http --scheme https \
+  --include-verbatim $extrargs \
+  text |
+  tee lychee-full.log
+
+# shellcheck disable=SC2086
+# save fail_map so we can map failed urls back to wiki pages
+lychee -E \
+  --cache --scheme http --scheme https \
+  --include-verbatim $excludeargs $extrargs \
+  --format json \
+  text >lychee.json
+
+# get archive suggestions
+# --timeout not working with --suggest, see https://github.com/lycheeverse/lychee/issues/1501
+# TODO remove the timeout command after the issue is fixed
+# shellcheck disable=SC2086
+timeout 30 lychee -E \
+  --cache --scheme http --scheme https \
+  --include-verbatim $excludeargs $extrargs \
+  --suggest \
+  text |
+  tee lychee-wayback.log
+
+# tsv of status, url, and the corresponding wiki page link
+python ../main.py dumplinkmap lychee.json failed-wiki-links.csv
+
+# sort for consistency
+{
+  head -n 1 failed-wiki-links.csv
+  tail -n +2 failed-wiki-links.csv | sort -t$'\t' -k2,2
+} >sorted_filename.tsv
+mv sorted_filename.tsv failed-wiki-links.csv
+
+cat failed-wiki-links.csv
+
+dest="../lychee-$(printf '%(%Y-%m-%d)T\n')-report"
+mkdir "$dest"
+cp ../allowed.links lychee*.log failed-wiki-links.csv "$dest"
+
+popd || exit
+#rm -rf "$workdir"
diff --git a/checks/linkcheck/main.py b/checks/linkcheck/main.py
new file mode 100644
index 0000000..b8128b0
--- /dev/null
+++ b/checks/linkcheck/main.py
@@ -0,0 +1,113 @@
+import json
+import csv
+import re
+import sys
+import xml.etree.ElementTree as ET
+
+
+# filter out unimportant pages like Talk:, User:, and old revisions of posts
+def process_dump(dump_file, out_file):
+    tree = ET.parse(dump_file)
+    root = tree.getroot()
+
+    ns = {"mw": "http://www.mediawiki.org/xml/export-0.11/"}
+    ET.register_namespace("", ns["mw"])
+
+    for page in root.findall("mw:page", ns):
+        title = page.find("mw:title", ns).text
+
+        if title.startswith("User:") or title.startswith("Talk:"):
+            root.remove(page)
+            continue
+
+        revisions = page.findall("mw:revision", ns)
+
+        if len(revisions) > 1:
+            latest_revision = max(
+                revisions, key=lambda rev: rev.find("mw:timestamp", ns).text
+            )
+
+            # Remove all revisions except the latest one
+            for revision in revisions:
+                if revision != latest_revision:
+                    page.remove(revision)
+
+    tree.write(out_file, encoding="utf-8", xml_declaration=False)
+
+
+def badlinks_print(known_file, outfile):
+    with open(known_file, "r") as infile, open(outfile, "w") as of:
+        for line in infile:
+            stripped_line = line.strip()
+            if stripped_line and not stripped_line.startswith("#"):
+                of.write(f"--exclude {stripped_line} ")
+
+
+def dump_link_map(jsonfile, dumpfile):
+    with open(jsonfile, "r") as json_file:
+        fail_data = json.load(json_file)
+
+    with open(dumpfile, mode="w", newline="", encoding="utf-8") as csv_file:
+        csv_writer = csv.writer(csv_file, delimiter="\t", quotechar='"')
+        csv_writer.writerow(["STATUS", "URL", "WIKIURL"])
+
+        for xml_file, failed_url_entries in fail_data["fail_map"].items():
+            with open(xml_file, "r", encoding="utf-8") as xmlf:
+                root = ET.fromstring(f"<root>{xmlf.read()}</root>")
+
+            for doc in root.findall("doc"):
+                title = doc.attrib.get("title")
+                title = re.sub(r"\s+", "_", title)
+                content = doc.text
+
+                for entry in failed_url_entries:
+                    url = entry["url"]
+                    status = entry.get("status", {}).get("code", 403)
+                    if content and url in content:
+                        csv_writer.writerow(
+                            [
+                                status,
+                                url,
+                                f"https://wiki.nixos.org/wiki/{title}",
+                            ]
+                        )
+
+
+def print_usage(status=0):
+    print(
+        """
+Usage: python main.py [action]
+    [action]                             what?
+    ——————————————————————————————————————————————————————————
+    filter dumpxmlfile outxmlfile        filter out unnecessary pages from dump
+    badlinks badlinksfile outfile        parse and print known allowed.links
+    dumplinkmap jsonfile outfilelinkmap  dumps a map of url and nixos article where it is present
+    help                                 prints this help message and exits
+"""
+    )
+    sys.exit(status)
+
+
+if __name__ == "__main__":
+    if len(sys.argv) < 2:
+        print_usage(1)
+    action = sys.argv[1]
+    if action in ("filter", "badlinks", "dumplinkmap"):
+        if len(sys.argv) != 4:
+            print_usage(1)
+        if action == "filter":
+            dump_file = sys.argv[2]
+            out_file = sys.argv[3]
+            process_dump(dump_file, out_file)
+        elif action == "badlinks":
+            known_file = sys.argv[2]
+            out_file = sys.argv[3]
+            badlinks_print(known_file, out_file)
+        elif action == "dumplinkmap":
+            jsonfile = sys.argv[2]
+            dumpfile = sys.argv[3]
+            dump_link_map(jsonfile, dumpfile)
+    elif action in ("help", "--help"):
+        print_usage(0)
+    else:
+        print_usage(1)
diff --git a/checks/linkcheck/pkgs/default.nix b/checks/linkcheck/pkgs/default.nix
new file mode 100644
index 0000000..ec85a5c
--- /dev/null
+++ b/checks/linkcheck/pkgs/default.nix
@@ -0,0 +1,4 @@
+{ pkgs ? import <nixpkgs> { } }:
+{
+  wikiextractor = pkgs.callPackage ./wikiextractor.nix { };
+}
diff --git a/checks/linkcheck/pkgs/wikiextractor.nix b/checks/linkcheck/pkgs/wikiextractor.nix
new file mode 100644
index 0000000..ad58c0b
--- /dev/null
+++ b/checks/linkcheck/pkgs/wikiextractor.nix
@@ -0,0 +1,44 @@
+{ lib
+, python3
+, fetchpatch
+, fetchFromGitHub
+,
+}:
+
+python3.pkgs.buildPythonApplication rec {
+  pname = "wikiextractor";
+  version = "3.0.7";
+  pyproject = true;
+
+  src = fetchFromGitHub {
+    owner = "attardi";
+    repo = "wikiextractor";
+    rev = "v${version}";
+    hash = "sha256-QeBC6ACHGKCSegd+wnOyIZI93L+f1EU62sFE0sAEwhU=";
+  };
+
+  build-system = [
+    python3.pkgs.setuptools
+    python3.pkgs.wheel
+  ];
+
+  pythonImportsCheck = [
+    "wikiextractor"
+  ];
+
+  patches = [
+    # https://github.com/attardi/wikiextractor/issues/336#issuecomment-2322886454
+    (fetchpatch {
+      url = "https://github.com/attardi/wikiextractor/commit/ab8988ebfa9e4557411f3d4c0f4ccda139e18875.patch";
+      hash = "sha256-K1N6BA3FLieBTMIg9fyavc9ZajAr0vs754Nox53htmY=";
+    })
+  ];
+
+  meta = {
+    description = "A tool for extracting plain text from Wikipedia dumps";
+    homepage = "https://github.com/attardi/wikiextractor";
+    license = lib.licenses.agpl3Only;
+    maintainers = with lib.maintainers; [ phanirithvij ];
+    mainProgram = "wikiextractor";
+  };
+}
diff --git a/formatter.nix b/formatter.nix
index 940a98b..5900b5e 100644
--- a/formatter.nix
+++ b/formatter.nix
@@ -29,32 +29,42 @@
       ];
       programs.shellcheck.enable = true;
       programs.deno.enable = true;
+      programs.black.enable = true;
     };
-    packages.default = pkgs.mkShell {
-      packages =
-        let
-          convert2Tofu =
-            provider:
-            provider.override (prev: {
-              homepage = builtins.replaceStrings [ "registry.terraform.io/providers" ] [
-                "registry.opentofu.org"
+    packages = {
+      default = pkgs.mkShell {
+        packages =
+          let
+            convert2Tofu =
+              provider:
+              provider.override (prev: {
+                homepage = builtins.replaceStrings [ "registry.terraform.io/providers" ] [
+                  "registry.opentofu.org"
+                ]
+                  prev.homepage;
+              });
+          in
+          [
+            pkgs.bashInteractive
+            pkgs.sops
+            (pkgs.opentofu.withPlugins (
+              p:
+              builtins.map convert2Tofu [
+                p.hcloud
+                p.null
+                p.external
+                p.local
               ]
-                prev.homepage;
-            });
-        in
-        [
-          pkgs.bashInteractive
-          pkgs.sops
-          (pkgs.opentofu.withPlugins (
-            p:
-            builtins.map convert2Tofu [
-              p.hcloud
-              p.null
-              p.external
-              p.local
-            ]
-          ))
-        ];
+            ))
+          ];
+      };
+    }
+    // (import ./checks/linkcheck/pkgs { inherit pkgs; });
+    devShells.linkcheck = pkgs.mkShell {
+      packages = [
+        pkgs.lychee
+        (pkgs.python3.withPackages (pypkgs: [ pypkgs.lxml ]))
+      ];
     };
   };
 }
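
A minimal local-run sketch for reviewing this change, assuming the repository flake exposes the `devShells.linkcheck` shell added in formatter.nix, that `gh auth login` has already been done for the optional token fallback, and that `column` (util-linux) is available; the `<date>` in the report directory name is whatever day the script runs:

```shell
# from the repository root
cd checks/linkcheck
nix develop ..#linkcheck # or: direnv allow (see .envrc)

# downloads the wiki dump, filters it, extracts text, and runs the lychee passes;
# reports end up in checks/linkcheck/lychee-<date>-report
./lychee.sh

# pretty-print the tab-separated failure map (STATUS, URL, WIKIURL)
column -t -s$'\t' workdir/failed-wiki-links.csv | less -S
```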