Mirror of https://github.com/Mic92/nixos-wiki-infra.git, synced 2025-06-27 05:01:53 +02:00

Merge pull request #140 from phanirithvij/main

ci: add broken link checker action using lychee

Commit 75887b31fd
.github/workflows/check-dead-links.yml (vendored, new file)
@@ -0,0 +1,30 @@
name: "check broken links in wiki"
on:
  workflow_dispatch:
  # twice per month
  schedule:
    - cron: "0 14 1,15 * *"
jobs:
  lychee:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: cachix/install-nix-action@v27
        with:
          nix_path: nixpkgs=channel:nixos-unstable
      - run: ./checks/linkcheck/lychee.sh
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
      - name: upload lychee report as artifact
        uses: actions/upload-artifact@v4
        with:
          name: lychee-report
          if-no-files-found: error
          path: ./checks/linkcheck/lychee*-report
      - name: upload filtered xmldump artifact
        uses: actions/upload-artifact@v4
        with:
          name: wikidump-filtered.xml
          if-no-files-found: error
          path: ./checks/linkcheck/workdir/wikidump-filtered.xml
          compression-level: 9
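Besides the twice-monthly cron (14:00 UTC on the 1st and 15th), the workflow declares `workflow_dispatch:`, so it can also be started by hand. A minimal sketch using the GitHub CLI, with the mirror's repository slug assumed for illustration:

```shell
# trigger the link check manually (repository slug assumed)
gh workflow run check-dead-links.yml --repo Mic92/nixos-wiki-infra

# list recent runs of this workflow
gh run list --workflow=check-dead-links.yml --repo Mic92/nixos-wiki-infra
```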
.gitignore (vendored, 1 line added)
@@ -1,3 +1,4 @@
 /.envrc.private
 .terraform.lock.hcl
 **/.terraform
+.direnv
checks/linkcheck/.envrc (new file)
@@ -0,0 +1 @@
use flake .#linkcheck
checks/linkcheck/.gitignore (vendored, new file)
@@ -0,0 +1,5 @@
temp
.direnv
*-report
result*
workdir
checks/linkcheck/README.md (new file)
@@ -0,0 +1,35 @@
## What

A dead-links checker for wiki.nixos.org, run as a GitHub Action and powered by
[lychee](https://github.com/lycheeverse/lychee).

Runs twice per month; the schedule can be adjusted in the GitHub Action cron
job. Broken links need to be edited manually in the wiki; nothing is automated.

The initial run gave ~100 results, which were fixed manually; see the entries
before 16 Sep
[here](https://wiki.nixos.org/w/index.php?title=Special:Contributions/Phanirithvij&target=Phanirithvij&offset=&limit=100).

## Why

Dead links, if detected early, have a chance of being fixed before linkrot sets in.

- Why not use a broken-link-checker GitHub action?
  - This script is not tied to GitHub Actions (it also works locally).
  - The action only calls the script and uploads the artifacts.

## Instructions

```shell
./checks/linkcheck/lychee.sh
```

The report will be generated under `/path/to/checks/linkcheck`.

As usual, `nix fmt` works inside the linkcheck dir.

## TODO/Roadmap

- [ ] Archive all links found in the lychee scan (see `lychee --dump` and the sketch below).
  - Since these links are prone to deletion, it is our duty to archive them.
  - There was a cli tool for this, forgot what it is, rediscover it
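For the archiving TODO above, one possible direction (an unverified sketch, not part of this commit) is to pipe the `lychee --dump` output into the Internet Archive's save endpoint:

```shell
# sketch only: dump every link lychee finds in the extracted pages and ask the
# Wayback Machine to snapshot it; the save endpoint is rate-limited, hence the sleep
lychee --dump --scheme http --scheme https text | sort -u |
  while read -r url; do
    curl -fsS -o /dev/null "https://web.archive.org/save/$url" || true
    sleep 5
  done
```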
checks/linkcheck/allowed.links (new file)
@@ -0,0 +1,27 @@
# an allowlist of known-bad regexps
# each line can be a comment, a regex, or a full/partial url
# comments will be stripped out
# urls must be urlencoded
# stitched into --exclude args per line and passed to lychee

(http://(server|unix|hydra)|https://(cache|relay-server)/)

# %7B is { and matches urls inside nix expressions like ${version} etc.
(.*\.(domain|tld|mydomain|local).*|my.app|%7B)

(qemu/1.0|locating-rules/1.0)

# no need to fetch cache entries
(cache.nixos.org|fzakaria.cachix.org)

# urls in example snippets
(USERNAME/nixpkgs|your_username/nixpkgs|fooUser/barRepo|code.visualstudio.com/sha|path/to/patch)

# works in browser
https://www.phoronix.com/news/Mesa-Delete-Clover-Discussion

# works with git clone
https://review.coreboot.org/coreboot.git

# works in browser
https://pypi.org/project/stt/#files
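As the header comments say, the non-comment lines are turned into `--exclude` arguments (by `main.py badlinks`, further down in this commit) and spliced into the lychee invocation in `lychee.sh`. Roughly, the effective call looks like this sketch (only two of the generated `--exclude` pairs shown):

```shell
# sketch of the effective lychee invocation built by lychee.sh
lychee -E --cache --scheme http --scheme https --include-verbatim \
  --exclude '(cache.nixos.org|fzakaria.cachix.org)' \
  --exclude 'https://review.coreboot.org/coreboot.git' \
  --exclude 'wiki.nixos.org/wiki' \
  --max-concurrency 16 \
  text
```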
checks/linkcheck/lychee.sh (new executable file)
@@ -0,0 +1,87 @@
#! /usr/bin/env nix-shell
#! nix-shell -i bash -p 'python3.withPackages (ps: with ps; [ lxml ])' curl zstd bash findutils gnused coreutils lychee
# shellcheck shell=bash

SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)

workdir="$SCRIPT_DIR/workdir"
mkdir -p "$workdir"
pushd "$workdir" || exit

curl "https://wiki.nixos.org/wikidump.xml.zst" | zstd -d >wikidump.xml

# filter unimportant pages like User:* Talk:*
python3 ../main.py filter wikidump.xml wikidump-filtered.xml

# generate exclude args from the allowlist
python3 ../main.py badlinks ../allowed.links exclude-args

extrargs=(
  # exclude sending requests to the wiki itself
  "--exclude" "wiki.nixos.org/wiki"
  # default is too high
  "--max-concurrency" "16"
)
read -r -a excludeargs <<<"$(<exclude-args)"

# extract only the text from the filtered xml dump
nix --extra-experimental-features "nix-command flakes" run ..#wikiextractor wikidump-filtered.xml

# lychee requires .md or .html files to parse
find text -type f ! -name "*.html" -print0 | xargs -0 -I{} mv {} "{}.html"

# GITHUB_TOKEN from env, or fall back to gh (local dev)
if [ -z "${GITHUB_TOKEN}" ]; then
  if command -v gh &>/dev/null; then
    echo "using gh auth token"
    GITHUB_TOKEN=$(gh auth token)
  fi
fi

if [ -n "${GITHUB_TOKEN}" ]; then
  echo "using github token"
  extrargs+=("--github-token" "$GITHUB_TOKEN")
fi

# fetch links
lychee -E \
  --cache --scheme http --scheme https \
  --include-verbatim "${excludeargs[@]}" "${extrargs[@]}" \
  text |
  tee lychee.log

# get all links, ignoring the allowlist (allowed.links)
lychee -E \
  --cache --scheme http --scheme https \
  --include-verbatim "${extrargs[@]}" \
  text |
  tee lychee-full.log

# save fail_map so we can construct a map from failed urls to wiki pages
lychee -E \
  --cache --scheme http --scheme https \
  --include-verbatim "${excludeargs[@]}" "${extrargs[@]}" \
  --format json \
  text >lychee.json

# get archive suggestions
# --timeout not working with --suggest, see https://github.com/lycheeverse/lychee/issues/1501
# TODO remove the timeout command after the issue is fixed
timeout 30 lychee -E \
  --cache --scheme http --scheme https \
  --include-verbatim "${excludeargs[@]}" "${extrargs[@]}" \
  --suggest \
  text |
  tee lychee-wayback.log

# csv of status, url, corresponding wiki page link
python3 ../main.py dumplinkmap lychee.json failed-wiki-links.csv

cat failed-wiki-links.csv

dest="../lychee-$(printf '%(%Y-%m-%d)T\n')-report"
mkdir -p "$dest"
cp ../allowed.links lychee*.log failed-wiki-links.csv "$dest"

popd || exit
#rm -rf "$workdir"
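The script carries a `nix-shell` shebang, so it can be run directly; the commit also adds a `devShells.linkcheck` output and a matching `.envrc`. A sketch of the local workflow, assuming a flakes-enabled Nix:

```shell
# from the repository root, enter the dedicated dev shell and run the check
nix develop .#linkcheck
./checks/linkcheck/lychee.sh

# or let direnv pick up checks/linkcheck/.envrc ("use flake .#linkcheck")
cd checks/linkcheck && direnv allow
```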
checks/linkcheck/main.py (new file)
@@ -0,0 +1,143 @@
import json
import csv
import re
import sys
import argparse
import bisect
import xml.etree.ElementTree as ET
from pathlib import Path


def get_revision_timestamp(revision: ET.Element, ns: dict[str, str]) -> str:
    timestamp = revision.find("mw:timestamp", ns)
    if timestamp is None:
        print(
            f"Timestamp tag not found in revision: {ET.tostring(revision)}",
            file=sys.stderr,
        )
        return ""
    if timestamp.text is None:
        print(
            f"Timestamp text doesn't exist in revision: {ET.tostring(revision)}",
            file=sys.stderr,
        )
        return ""
    return timestamp.text


# filter out unimportant pages like Talk:, User:, and old revisions of posts
def process_dump(args: argparse.Namespace) -> None:
    tree = ET.parse(str(args.dump_file))
    root = tree.getroot()

    ns = {"mw": "http://www.mediawiki.org/xml/export-0.11/"}
    ET.register_namespace("", ns["mw"])

    for page in root.findall("mw:page", ns):
        title_tag = page.find("mw:title", ns)
        if title_tag is None:
            print(f"Title tag not found in page: {ET.tostring(page)}", file=sys.stderr)
            continue
        title = title_tag.text
        if title is None:
            print(
                f"Title text doesn't exist in page: {ET.tostring(page)}",
                file=sys.stderr,
            )
            continue

        if title.startswith("User:") or title.startswith("Talk:"):
            root.remove(page)
            continue

        revisions = page.findall("mw:revision", ns)

        if len(revisions) > 1:
            latest_revision = max(
                revisions, key=lambda revision: get_revision_timestamp(revision, ns)
            )

            # Remove all revisions except the latest one
            for revision in revisions:
                if revision != latest_revision:
                    page.remove(revision)

    tree.write(str(args.out_file), encoding="utf-8", xml_declaration=False)


def badlinks_print(args: argparse.Namespace) -> None:
    with args.known_file.open() as infile, args.out_file.open("w") as of:
        for line in infile:
            stripped_line = line.strip()
            if stripped_line and not stripped_line.startswith("#"):
                of.write(f"--exclude {stripped_line} ")


def read_lychee_file(lychee_file: Path) -> list[list[str]]:
    fail_data = json.loads(lychee_file.read_text())
    failed_urls = []
    for xml_file, failed_url_entries in fail_data["fail_map"].items():
        with open(xml_file, "r", encoding="utf-8") as xmlf:
            root = ET.fromstring(f"<root>{xmlf.read()}</root>")
        for doc in root.findall("doc"):
            title = doc.attrib.get("title")
            if title is None:
                print(f"Title not found in doc: {ET.tostring(doc)}", file=sys.stderr)
                continue
            title = re.sub(r"\s+", "_", title)
            content = doc.text or ""  # doc.text may be None for empty pages
            for entry in failed_url_entries:
                url = entry["url"]
                status = entry.get("status", {}).get("code", 403)
                if url in content:
                    bisect.insort(
                        failed_urls,
                        [
                            status,
                            url,
                            f"https://wiki.nixos.org/wiki/{title}",
                        ],
                    )
    return failed_urls


def dump_link_map(args: argparse.Namespace) -> None:
    failed_urls = read_lychee_file(args.json_file)

    with args.dump_file.open(mode="w", newline="", encoding="utf-8") as csv_file:
        csv_writer = csv.writer(csv_file, delimiter="\t", quotechar='"')
        csv_writer.writerow(["STATUS", "URL", "WIKIURL"])

        for item in failed_urls:
            csv_writer.writerow(item)


def main() -> None:
    parser = argparse.ArgumentParser(description="Process wiki dump files")
    subparsers = parser.add_subparsers()
    parser_filter = subparsers.add_parser("filter", help="Filter out unimportant pages")

    parser_filter.add_argument("dump_file", type=Path)
    parser_filter.add_argument("out_file", type=Path)
    parser_filter.set_defaults(func=process_dump)

    parser_badlinks = subparsers.add_parser(
        "badlinks", help="Parse and print known allowed links"
    )
    parser_badlinks.add_argument("known_file", type=Path)
    parser_badlinks.add_argument("out_file", type=Path)
    parser_badlinks.set_defaults(func=badlinks_print)

    parser_dumplinkmap = subparsers.add_parser(
        "dumplinkmap", help="Dump a map of url and nixos article where it is present"
    )
    parser_dumplinkmap.add_argument("json_file", type=Path)
    parser_dumplinkmap.add_argument("dump_file", type=Path)
    parser_dumplinkmap.set_defaults(func=dump_link_map)

    args = parser.parse_args()
    args.func(args)


if __name__ == "__main__":
    main()
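For reference, these are the three subcommands exactly as `lychee.sh` invokes them (paths relative to `checks/linkcheck/workdir`):

```shell
python3 ../main.py filter wikidump.xml wikidump-filtered.xml      # drop User:/Talk: pages and old revisions
python3 ../main.py badlinks ../allowed.links exclude-args         # emit "--exclude <pattern>" pairs
python3 ../main.py dumplinkmap lychee.json failed-wiki-links.csv  # TSV of STATUS, URL, WIKIURL
```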
checks/linkcheck/pkgs/default.nix (new file)
@@ -0,0 +1,4 @@
{ pkgs ? import <nixpkgs> { } }:
{
  wikiextractor = pkgs.callPackage ./wikiextractor.nix { };
}
checks/linkcheck/pkgs/wikiextractor.nix (new file)
@@ -0,0 +1,44 @@
{ lib
, python3
, fetchpatch
, fetchFromGitHub
,
}:

python3.pkgs.buildPythonApplication rec {
  pname = "wikiextractor";
  version = "3.0.7";
  pyproject = true;

  src = fetchFromGitHub {
    owner = "attardi";
    repo = "wikiextractor";
    rev = "v${version}";
    hash = "sha256-QeBC6ACHGKCSegd+wnOyIZI93L+f1EU62sFE0sAEwhU=";
  };

  build-system = [
    python3.pkgs.setuptools
    python3.pkgs.wheel
  ];

  pythonImportsCheck = [
    "wikiextractor"
  ];

  patches = [
    # https://github.com/attardi/wikiextractor/issues/336#issuecomment-2322886454
    (fetchpatch {
      url = "https://github.com/attardi/wikiextractor/commit/ab8988ebfa9e4557411f3d4c0f4ccda139e18875.patch";
      hash = "sha256-K1N6BA3FLieBTMIg9fyavc9ZajAr0vs754Nox53htmY=";
    })
  ];

  meta = {
    description = "A tool for extracting plain text from Wikipedia dumps";
    homepage = "https://github.com/attardi/wikiextractor";
    license = lib.licenses.agpl3Only;
    maintainers = with lib.maintainers; [ phanirithvij ];
    mainProgram = "wikiextractor";
  };
}
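Because the flake merges this attribute set into its `packages` output (next hunk), the derivation can be exercised on its own; a small sketch:

```shell
# build the patched wikiextractor from the repository root
nix build .#wikiextractor

# lychee.sh runs it against the filtered dump like this (from checks/linkcheck/workdir)
nix run ..#wikiextractor wikidump-filtered.xml
```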
@@ -30,7 +30,8 @@
       programs.shellcheck.enable = true;
       programs.deno.enable = true;
     };
-    packages.default = pkgs.mkShell {
+    packages = {
+      default = pkgs.mkShell {
       packages =
         let
           convert2Tofu =
@@ -56,5 +57,13 @@
           ))
         ];
       };
+      }
+      // (import ./checks/linkcheck/pkgs { inherit pkgs; });
+      devShells.linkcheck = pkgs.mkShell {
+        packages = [
+          pkgs.lychee
+          (pkgs.python3.withPackages (pypkgs: [ pypkgs.lxml ]))
+        ];
+      };
     };
   }