From 2e25f4e01854f2c3e2117715d31cb0c49f21a3ec Mon Sep 17 00:00:00 2001
From: phanirithvij
Date: Mon, 16 Sep 2024 08:57:55 +0530
Subject: [PATCH 01/15] gitignore .direnv

Signed-off-by: phanirithvij
---
 .gitignore | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.gitignore b/.gitignore
index f2ea7e0..75c407a 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,3 +1,4 @@
 /.envrc.private
 .terraform.lock.hcl
 **/.terraform
+.direnv

From 0afcb722676807b47e6e5081d5e9cf23da57436a Mon Sep 17 00:00:00 2001
From: phanirithvij
Date: Mon, 16 Sep 2024 13:29:07 +0530
Subject: [PATCH 02/15] ci: add broken link checker action

Signed-off-by: phanirithvij
---
 .github/workflows/check-dead-links.yml  | 30 +++++++
 checks/linkcheck/.envrc                 |  1 +
 checks/linkcheck/.gitignore             |  5 ++
 checks/linkcheck/README.md              | 38 ++++++++
 checks/linkcheck/allowed.links          | 27 ++++++
 checks/linkcheck/lychee.sh              | 99 +++++++++++++++++++++
 checks/linkcheck/main.py                | 113 ++++++++++++++++++++++++
 checks/linkcheck/pkgs/default.nix       |  4 +
 checks/linkcheck/pkgs/wikiextractor.nix | 44 +++++++++
 formatter.nix                           | 58 +++++-----
 10 files changed, 395 insertions(+), 24 deletions(-)
 create mode 100644 .github/workflows/check-dead-links.yml
 create mode 100644 checks/linkcheck/.envrc
 create mode 100644 checks/linkcheck/.gitignore
 create mode 100644 checks/linkcheck/README.md
 create mode 100644 checks/linkcheck/allowed.links
 create mode 100755 checks/linkcheck/lychee.sh
 create mode 100644 checks/linkcheck/main.py
 create mode 100644 checks/linkcheck/pkgs/default.nix
 create mode 100644 checks/linkcheck/pkgs/wikiextractor.nix

diff --git a/.github/workflows/check-dead-links.yml b/.github/workflows/check-dead-links.yml
new file mode 100644
index 0000000..7acb455
--- /dev/null
+++ b/.github/workflows/check-dead-links.yml
@@ -0,0 +1,30 @@
+name: "check broken links in wiki"
+on:
+  workflow_dispatch:
+  # twice per month
+  schedule:
+    - cron: "0 14 1,15 * *"
+jobs:
+  lychee:
+    runs-on: ubuntu-latest
+    steps:
+      - uses: actions/checkout@v4
+      - uses: cachix/install-nix-action@v27
+        with:
+          nix_path: nixpkgs=channel:nixos-unstable
+      - run: ./checks/linkcheck/lychee.sh
+        env:
+          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+      - name: upload lychee report as artifact
+        uses: actions/upload-artifact@v4
+        with:
+          name: lychee-report
+          if-no-files-found: error
+          path: ./checks/linkcheck/lychee*-report
+      - name: upload filtered xmldump artifact
+        uses: actions/upload-artifact@v4
+        with:
+          name: wikidump-filtered.xml
+          if-no-files-found: error
+          path: ./checks/linkcheck/workdir/wikidump-filtered.xml
+          compression-level: 9
diff --git a/checks/linkcheck/.envrc b/checks/linkcheck/.envrc
new file mode 100644
index 0000000..48cb98d
--- /dev/null
+++ b/checks/linkcheck/.envrc
@@ -0,0 +1 @@
+use flake .#linkcheck
diff --git a/checks/linkcheck/.gitignore b/checks/linkcheck/.gitignore
new file mode 100644
index 0000000..6f6c2b4
--- /dev/null
+++ b/checks/linkcheck/.gitignore
@@ -0,0 +1,5 @@
+temp
+.direnv
+*-report
+result*
+workdir
diff --git a/checks/linkcheck/README.md b/checks/linkcheck/README.md
new file mode 100644
index 0000000..4a1cb35
--- /dev/null
+++ b/checks/linkcheck/README.md
@@ -0,0 +1,38 @@
+## What
+
+wiki.nixos.org dead link checker, powered by GitHub Actions (gha) and
+[lychee](https://github.com/lycheeverse/lychee).
+
+Runs twice a month; the schedule can be adjusted in the github action cron job.
+Broken links need to be fixed manually in the wiki, nothing is automated.
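+
+Each run produces a `failed-wiki-links.csv` report: a tab-separated file with
+STATUS, URL and WIKIURL columns, mapping each dead link to the wiki page it
+appears on. An illustrative row (the URLs below are made up, not real output):
+
+```
+STATUS	URL	WIKIURL
+404	https://example.org/some-dead-link	https://wiki.nixos.org/wiki/Some_Page
+```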
+
+The initial run gave ~100 results, which were fixed manually; see the entries
+before 16 Sep
+[here](https://wiki.nixos.org/w/index.php?title=Special:Contributions/Phanirithvij&target=Phanirithvij&offset=&limit=100).
+
+## Why
+
+Dead links that are detected early can be fixed before link rot sets in.
+
+- Why not use a broken-link-checker github action?
+  - written so that it is not tied to gha (it works locally)
+  - gha only calls the script and uploads the artifacts
+
+## Instructions
+
+```shell
+cd ./checks/linkcheck
+direnv allow # or # nix develop ..#linkcheck
+./lychee.sh
+```
+
+It can be run from anywhere so `/path/to/checks/linkcheck/lychee.sh` works but
+the report will be generated at `/path/to/checks/linkcheck`.
+
+As usual, `nix fmt` works inside linkcheck dir.
+
+## TODO/Roadmap
+
+- [ ] archive all links found in lychee scan (see lychee --dump)
+  - Since these links are prone to deletion, it is our duty to archive them.
+  - There was a CLI tool for this (name forgotten); rediscover it.
diff --git a/checks/linkcheck/allowed.links b/checks/linkcheck/allowed.links
new file mode 100644
index 0000000..bb1c731
--- /dev/null
+++ b/checks/linkcheck/allowed.links
@@ -0,0 +1,27 @@
+# an allowlist of known bad links, as regexes
+# each line can be a comment, a regex, or a full/partial url
+# comments will be stripped out
+# urls must be urlencoded
+# stitched up into --exclude args per line and passed to lychee
+
+(http://(server|unix|hydra)|https://(cache|relay-server)/)
+
+# %7B is {, matches urls inside nix expressions like ${version} etc.
+(.*\.(domain|tld|mydomain|local).*|my.app|%7B)
+
+(qemu/1.0|locating-rules/1.0)
+
+# no need to fetch cache entries
+(cache.nixos.org|fzakaria.cachix.org)
+
+# urls in example snippets
+(USERNAME/nixpkgs|your_username/nixpkgs|fooUser/barRepo|code.visualstudio.com/sha|path/to/patch)
+
+# works in browser
+https://www.phoronix.com/news/Mesa-Delete-Clover-Discussion
+
+# works with git clone
+https://review.coreboot.org/coreboot.git
+
+# works in browser
+https://pypi.org/project/stt/#files
diff --git a/checks/linkcheck/lychee.sh b/checks/linkcheck/lychee.sh
new file mode 100755
index 0000000..eab8300
--- /dev/null
+++ b/checks/linkcheck/lychee.sh
@@ -0,0 +1,99 @@
+#! /usr/bin/env nix-shell
+#! 
nix-shell -i bash -p wget p7zip bash findutils gnused coreutils lychee +# shellcheck shell=bash + +SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd) + +workdir="$SCRIPT_DIR/workdir" +mkdir -p "$workdir" +pushd "$workdir" || exit + +wget -O wikidump.xml.zst "https://wiki.nixos.org/wikidump.xml.zst" +7z x -aoa wikidump.xml.zst + +# filter unimportant pages like User:* Talk:* +python ../main.py filter wikidump.xml wikidump-filtered.xml + +# generate exclude args from allowlist +python ../main.py badlinks ../allowed.links exclude-args + +# exlude sending requests to the wiki +echo "--exclude wiki.nixos.org/wiki" >>exclude-args +extrargs="$extrargs --exclude wiki.nixos.org/wiki" +excludeargs=$(cat exclude-args) + +# extract only the text from the filtered xml dump +nix --extra-experimental-features "nix-command flakes" run ..#wikiextractor wikidump-filtered.xml + +# lychee requires .md or .html format files to parse +find text -type f | grep -v .html | xargs -I{} mv {} "{}.html" + +# default is too high +extrargs="$extrargs --max-concurrency 16" + +# github_token from env or fallback to gh (local dev) +if [ -z "${GITHUB_TOKEN}" ]; then + if command -v gh -v &>/dev/null; then + echo using gh auth token + GITHUB_TOKEN=$(gh auth token) + fi +fi + +if [ -n "${GITHUB_TOKEN}" ]; then + echo using github token + extrargs="$extrargs --github-token $GITHUB_TOKEN" +fi + +# shellcheck disable=SC2086 +# fetch links +lychee -E \ + --cache --scheme http --scheme https \ + --include-verbatim $excludeargs $extrargs \ + text | + tee lychee.log + +# shellcheck disable=SC2086 +# get all links ignoring the allowlist (allowed.links) +lychee -E \ + --cache --scheme http --scheme https \ + --include-verbatim $extrargs \ + text | + tee lychee-full.log + +# shellcheck disable=SC2086 +# save fail_map so we can construct wiki link map to failed urls +lychee -E \ + --cache --scheme http --scheme https \ + --include-verbatim $excludeargs $extrargs \ + --format json \ + text >lychee.json + +# get archive suggestions +# --timeout not working with --suggest see https://github.com/lycheeverse/lychee/issues/1501 +# TODO remove timeout command later after the issue is fixed +# shellcheck disable=SC2086 +timeout 30 lychee -E \ + --cache --scheme http --scheme https \ + --include-verbatim $excludeargs $extrargs \ + --suggest \ + text | + tee lychee-wayback.log + +# csv of status, url, corresponding wiki page link +python ../main.py dumplinkmap lychee.json failed-wiki-links.csv + +# sort for consistency +{ + head -n 1 failed-wiki-links.csv + tail -n +2 failed-wiki-links.csv | sort -t$'\t' -k2,2 +} >sorted_filename.tsv +mv sorted_filename.tsv failed-wiki-links.csv + +cat failed-wiki-links.csv + +dest="../lychee-$(printf '%(%Y-%m-%d)T\n')-report" +mkdir "$dest" +cp ../allowed.links lychee*.log failed-wiki-links.csv "$dest" + +popd || exit +#rm -rf "$workdir" diff --git a/checks/linkcheck/main.py b/checks/linkcheck/main.py new file mode 100644 index 0000000..b8128b0 --- /dev/null +++ b/checks/linkcheck/main.py @@ -0,0 +1,113 @@ +import json +import csv +import re +import sys +import xml.etree.ElementTree as ET + + +# filter out unimportant pages like Talk:, User:, and old revisions of posts +def process_dump(dump_file, out_file): + tree = ET.parse(dump_file) + root = tree.getroot() + + ns = {"mw": "http://www.mediawiki.org/xml/export-0.11/"} + ET.register_namespace("", ns["mw"]) + + for page in root.findall("mw:page", ns): + title = page.find("mw:title", ns).text + + if title.startswith("User:") or 
title.startswith("Talk:"): + root.remove(page) + continue + + revisions = page.findall("mw:revision", ns) + + if len(revisions) > 1: + latest_revision = max( + revisions, key=lambda rev: rev.find("mw:timestamp", ns).text + ) + + # Remove all revisions except the latest one + for revision in revisions: + if revision != latest_revision: + page.remove(revision) + + tree.write(out_file, encoding="utf-8", xml_declaration=False) + + +def badlinks_print(known_file, outfile): + with open(known_file, "r") as infile, open(outfile, "w") as of: + for line in infile: + stripped_line = line.strip() + if stripped_line and not stripped_line.startswith("#"): + of.write(f"--exclude {stripped_line} ") + + +def dump_link_map(jsonfile, dumpfile): + with open(jsonfile, "r") as json_file: + fail_data = json.load(json_file) + + with open(dumpfile, mode="w", newline="", encoding="utf-8") as csv_file: + csv_writer = csv.writer(csv_file, delimiter="\t", quotechar='"') + csv_writer.writerow(["STATUS", "URL", "WIKIURL"]) + + for xml_file, failed_url_entries in fail_data["fail_map"].items(): + with open(xml_file, "r", encoding="utf-8") as xmlf: + root = ET.fromstring(f"{xmlf.read()}") + + for doc in root.findall("doc"): + title = doc.attrib.get("title") + title = re.sub(r"\s+", "_", title) + content = doc.text + + for entry in failed_url_entries: + url = entry["url"] + status = entry.get("status", {}).get("code", 403) + if url in content: + csv_writer.writerow( + [ + status, + url, + f"https://wiki.nixos.org/wiki/{title}", + ] + ) + + +def print_usage(status=0): + print( + """ +Usage: python main.py [action] + [action] what? + —————————————————————————————————————————————————————————— + filter dumpxmlfile outxmlfile filter out unncesscary pages from dump + badlinks badlinksfile outfile parse and print known allowed.links + dumplinkmap jsonfile outfilelinkmap dumps a map of url and nixos article where it is present + help prints this help message and exits +""" + ) + sys.exit(status) + + +if __name__ == "__main__": + if len(sys.argv) < 2: + print_usage(1) + action = sys.argv[1] + if action in "filter|badlinks|dumplinkmap": + if len(sys.argv) != 4: + print_usage(1) + if action == "filter": + dump_file = sys.argv[2] + out_file = sys.argv[3] + process_dump(dump_file, out_file) + elif action == "badlinks": + known_file = sys.argv[2] + out_file = sys.argv[3] + badlinks_print(known_file, out_file) + elif action == "dumplinkmap": + jsonfile = sys.argv[2] + dumpfile = sys.argv[3] + dump_link_map(jsonfile, dumpfile) + elif action in "--help": + print_usage(0) + else: + print_usage(1) diff --git a/checks/linkcheck/pkgs/default.nix b/checks/linkcheck/pkgs/default.nix new file mode 100644 index 0000000..ec85a5c --- /dev/null +++ b/checks/linkcheck/pkgs/default.nix @@ -0,0 +1,4 @@ +{ pkgs ? 
import { } }: +{ + wikiextractor = pkgs.callPackage ./wikiextractor.nix { }; +} diff --git a/checks/linkcheck/pkgs/wikiextractor.nix b/checks/linkcheck/pkgs/wikiextractor.nix new file mode 100644 index 0000000..ad58c0b --- /dev/null +++ b/checks/linkcheck/pkgs/wikiextractor.nix @@ -0,0 +1,44 @@ +{ lib +, python3 +, fetchpatch +, fetchFromGitHub +, +}: + +python3.pkgs.buildPythonApplication rec { + pname = "wikiextractor"; + version = "3.0.7"; + pyproject = true; + + src = fetchFromGitHub { + owner = "attardi"; + repo = "wikiextractor"; + rev = "v${version}"; + hash = "sha256-QeBC6ACHGKCSegd+wnOyIZI93L+f1EU62sFE0sAEwhU="; + }; + + build-system = [ + python3.pkgs.setuptools + python3.pkgs.wheel + ]; + + pythonImportsCheck = [ + "wikiextractor" + ]; + + patches = [ + # https://github.com/attardi/wikiextractor/issues/336#issuecomment-2322886454 + (fetchpatch { + url = "https://github.com/attardi/wikiextractor/commit/ab8988ebfa9e4557411f3d4c0f4ccda139e18875.patch"; + hash = "sha256-K1N6BA3FLieBTMIg9fyavc9ZajAr0vs754Nox53htmY="; + }) + ]; + + meta = { + description = "A tool for extracting plain text from Wikipedia dumps"; + homepage = "https://github.com/attardi/wikiextractor"; + license = lib.licenses.agpl3Only; + maintainers = with lib.maintainers; [ phanirithvij ]; + mainProgram = "wikiextractor"; + }; +} diff --git a/formatter.nix b/formatter.nix index 940a98b..5900b5e 100644 --- a/formatter.nix +++ b/formatter.nix @@ -29,32 +29,42 @@ ]; programs.shellcheck.enable = true; programs.deno.enable = true; + programs.black.enable = true; }; - packages.default = pkgs.mkShell { - packages = - let - convert2Tofu = - provider: - provider.override (prev: { - homepage = builtins.replaceStrings [ "registry.terraform.io/providers" ] [ - "registry.opentofu.org" + packages = { + default = pkgs.mkShell { + packages = + let + convert2Tofu = + provider: + provider.override (prev: { + homepage = builtins.replaceStrings [ "registry.terraform.io/providers" ] [ + "registry.opentofu.org" + ] + prev.homepage; + }); + in + [ + pkgs.bashInteractive + pkgs.sops + (pkgs.opentofu.withPlugins ( + p: + builtins.map convert2Tofu [ + p.hcloud + p.null + p.external + p.local ] - prev.homepage; - }); - in - [ - pkgs.bashInteractive - pkgs.sops - (pkgs.opentofu.withPlugins ( - p: - builtins.map convert2Tofu [ - p.hcloud - p.null - p.external - p.local - ] - )) - ]; + )) + ]; + }; + } + // (import ./checks/linkcheck/pkgs { inherit pkgs; }); + devShells.linkcheck = pkgs.mkShell { + packages = [ + pkgs.lychee + (pkgs.python3.withPackages (pypkgs: [ pypkgs.lxml ])) + ]; }; }; } From 4e0b102f88793b0976ca0064588cdc1f63e47239 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rg=20Thalheim?= Date: Tue, 17 Sep 2024 09:53:41 +0200 Subject: [PATCH 03/15] linkcheck: replace wget/7z with curl/zstd --- checks/linkcheck/lychee.sh | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/checks/linkcheck/lychee.sh b/checks/linkcheck/lychee.sh index eab8300..e96ae16 100755 --- a/checks/linkcheck/lychee.sh +++ b/checks/linkcheck/lychee.sh @@ -1,5 +1,5 @@ #! /usr/bin/env nix-shell -#! nix-shell -i bash -p wget p7zip bash findutils gnused coreutils lychee +#! 
nix-shell -i bash -p curl zstd bash findutils gnused coreutils lychee # shellcheck shell=bash SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd) @@ -8,8 +8,7 @@ workdir="$SCRIPT_DIR/workdir" mkdir -p "$workdir" pushd "$workdir" || exit -wget -O wikidump.xml.zst "https://wiki.nixos.org/wikidump.xml.zst" -7z x -aoa wikidump.xml.zst +curl "https://wiki.nixos.org/wikidump.xml.zst" | zstd -d >wikidump.xml # filter unimportant pages like User:* Talk:* python ../main.py filter wikidump.xml wikidump-filtered.xml From 81746a3ea83d81d30028c40bf13b16dc7619fa6d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rg=20Thalheim?= Date: Tue, 17 Sep 2024 09:54:16 +0200 Subject: [PATCH 04/15] add missing python3 to nix shebang --- checks/linkcheck/lychee.sh | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/checks/linkcheck/lychee.sh b/checks/linkcheck/lychee.sh index e96ae16..6c63cd3 100755 --- a/checks/linkcheck/lychee.sh +++ b/checks/linkcheck/lychee.sh @@ -1,5 +1,5 @@ #! /usr/bin/env nix-shell -#! nix-shell -i bash -p curl zstd bash findutils gnused coreutils lychee +#! nix-shell -i bash -p python3 curl zstd bash findutils gnused coreutils lychee # shellcheck shell=bash SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd) @@ -11,10 +11,10 @@ pushd "$workdir" || exit curl "https://wiki.nixos.org/wikidump.xml.zst" | zstd -d >wikidump.xml # filter unimportant pages like User:* Talk:* -python ../main.py filter wikidump.xml wikidump-filtered.xml +python3 ../main.py filter wikidump.xml wikidump-filtered.xml # generate exclude args from allowlist -python ../main.py badlinks ../allowed.links exclude-args +python3 ../main.py badlinks ../allowed.links exclude-args # exlude sending requests to the wiki echo "--exclude wiki.nixos.org/wiki" >>exclude-args @@ -79,7 +79,7 @@ timeout 30 lychee -E \ tee lychee-wayback.log # csv of status, url, corresponding wiki page link -python ../main.py dumplinkmap lychee.json failed-wiki-links.csv +python3 ../main.py dumplinkmap lychee.json failed-wiki-links.csv # sort for consistency { From 4f79bc4c707e17692874169677e502d7f2db39a8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rg=20Thalheim?= Date: Tue, 17 Sep 2024 09:56:49 +0200 Subject: [PATCH 05/15] linkcheck: add type annotations --- checks/linkcheck/main.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/checks/linkcheck/main.py b/checks/linkcheck/main.py index b8128b0..8dc5270 100644 --- a/checks/linkcheck/main.py +++ b/checks/linkcheck/main.py @@ -3,10 +3,11 @@ import csv import re import sys import xml.etree.ElementTree as ET +from typing import NoReturn # filter out unimportant pages like Talk:, User:, and old revisions of posts -def process_dump(dump_file, out_file): +def process_dump(dump_file: str, out_file: str) -> None: tree = ET.parse(dump_file) root = tree.getroot() @@ -35,7 +36,7 @@ def process_dump(dump_file, out_file): tree.write(out_file, encoding="utf-8", xml_declaration=False) -def badlinks_print(known_file, outfile): +def badlinks_print(known_file: str, outfile: str) -> None: with open(known_file, "r") as infile, open(outfile, "w") as of: for line in infile: stripped_line = line.strip() @@ -43,7 +44,7 @@ def badlinks_print(known_file, outfile): of.write(f"--exclude {stripped_line} ") -def dump_link_map(jsonfile, dumpfile): +def dump_link_map(jsonfile: str, dumpfile: str) -> None: with open(jsonfile, "r") as json_file: fail_data = json.load(json_file) @@ -73,7 +74,7 @@ def dump_link_map(jsonfile, dumpfile): ) -def 
print_usage(status=0): +def print_usage(status: int = 0) -> NoReturn: print( """ Usage: python main.py [action] From 2d5336d3edf015d4bd90567b4bfaf9a3cd11a1ab Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rg=20Thalheim?= Date: Tue, 17 Sep 2024 10:02:39 +0200 Subject: [PATCH 06/15] linkcheck: make xml parsing more robust and fix types --- checks/linkcheck/main.py | 36 ++++++++++++++++++++++++++++++++++-- 1 file changed, 34 insertions(+), 2 deletions(-) diff --git a/checks/linkcheck/main.py b/checks/linkcheck/main.py index 8dc5270..4d00f78 100644 --- a/checks/linkcheck/main.py +++ b/checks/linkcheck/main.py @@ -6,6 +6,23 @@ import xml.etree.ElementTree as ET from typing import NoReturn +def get_revision_timestamp(revision: ET.Element, ns: dict[str, str]) -> str: + timestamp = revision.find("mw:timestamp", ns) + if timestamp is None: + print( + f"Timestamp tag not found in revision: {ET.tostring(revision)}", + file=sys.stderr, + ) + return "" + if timestamp.text is None: + print( + f"Timestamp text doesn't exist in revision: {ET.tostring(revision)}", + file=sys.stderr, + ) + return "" + return timestamp.text + + # filter out unimportant pages like Talk:, User:, and old revisions of posts def process_dump(dump_file: str, out_file: str) -> None: tree = ET.parse(dump_file) @@ -15,7 +32,17 @@ def process_dump(dump_file: str, out_file: str) -> None: ET.register_namespace("", ns["mw"]) for page in root.findall("mw:page", ns): - title = page.find("mw:title", ns).text + title_tag = page.find("mw:title", ns) + if title_tag is None: + print(f"Title tag not found in page: {ET.tostring(page)}", file=sys.stderr) + continue + title = title_tag.text + if title is None: + print( + f"Title text doesn't exist in page: {ET.tostring(page)}", + file=sys.stderr, + ) + continue if title.startswith("User:") or title.startswith("Talk:"): root.remove(page) @@ -25,7 +52,7 @@ def process_dump(dump_file: str, out_file: str) -> None: if len(revisions) > 1: latest_revision = max( - revisions, key=lambda rev: rev.find("mw:timestamp", ns).text + revisions, key=lambda revison: get_revision_timestamp(revison, ns) ) # Remove all revisions except the latest one @@ -58,6 +85,11 @@ def dump_link_map(jsonfile: str, dumpfile: str) -> None: for doc in root.findall("doc"): title = doc.attrib.get("title") + if title is None: + print( + f"Title not found in doc: {ET.tostring(doc)}", file=sys.stderr + ) + continue title = re.sub(r"\s+", "_", title) content = doc.text From f66c272fc2f78c62bd123b17703cb4fbc30c897f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rg=20Thalheim?= Date: Tue, 17 Sep 2024 10:03:56 +0200 Subject: [PATCH 07/15] linkcheck: move argparsing to main function --- checks/linkcheck/main.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/checks/linkcheck/main.py b/checks/linkcheck/main.py index 4d00f78..a7227c6 100644 --- a/checks/linkcheck/main.py +++ b/checks/linkcheck/main.py @@ -121,7 +121,7 @@ Usage: python main.py [action] sys.exit(status) -if __name__ == "__main__": +def main() -> None: if len(sys.argv) < 2: print_usage(1) action = sys.argv[1] @@ -144,3 +144,7 @@ if __name__ == "__main__": print_usage(0) else: print_usage(1) + + +if __name__ == "__main__": + main() From 7d16671ce2eba0277d71df62bceeb0440dd8942d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rg=20Thalheim?= Date: Tue, 17 Sep 2024 10:07:55 +0200 Subject: [PATCH 08/15] linkcheck: add argparse --- checks/linkcheck/main.py | 80 +++++++++++++++++----------------------- 1 file changed, 34 insertions(+), 46 deletions(-) diff --git 
a/checks/linkcheck/main.py b/checks/linkcheck/main.py index a7227c6..c949bce 100644 --- a/checks/linkcheck/main.py +++ b/checks/linkcheck/main.py @@ -2,8 +2,8 @@ import json import csv import re import sys +import argparse import xml.etree.ElementTree as ET -from typing import NoReturn def get_revision_timestamp(revision: ET.Element, ns: dict[str, str]) -> str: @@ -24,8 +24,8 @@ def get_revision_timestamp(revision: ET.Element, ns: dict[str, str]) -> str: # filter out unimportant pages like Talk:, User:, and old revisions of posts -def process_dump(dump_file: str, out_file: str) -> None: - tree = ET.parse(dump_file) +def process_dump( args: argparse.Namespace) -> None: + tree = ET.parse(args.dump_file) root = tree.getroot() ns = {"mw": "http://www.mediawiki.org/xml/export-0.11/"} @@ -60,22 +60,23 @@ def process_dump(dump_file: str, out_file: str) -> None: if revision != latest_revision: page.remove(revision) - tree.write(out_file, encoding="utf-8", xml_declaration=False) + tree.write(args.out_file, encoding="utf-8", xml_declaration=False) -def badlinks_print(known_file: str, outfile: str) -> None: - with open(known_file, "r") as infile, open(outfile, "w") as of: +def badlinks_print(args: argparse.Namespace) -> None: + # known_file: str, outfile: str) -> None: + with open(args.known_file, "r") as infile, open(args.outfile, "w") as of: for line in infile: stripped_line = line.strip() if stripped_line and not stripped_line.startswith("#"): of.write(f"--exclude {stripped_line} ") -def dump_link_map(jsonfile: str, dumpfile: str) -> None: - with open(jsonfile, "r") as json_file: +def dump_link_map(args: argparse.Namespace) -> None: + with open(args.jsonfile, "r") as json_file: fail_data = json.load(json_file) - with open(dumpfile, mode="w", newline="", encoding="utf-8") as csv_file: + with open(args.dumpfile, mode="w", newline="", encoding="utf-8") as csv_file: csv_writer = csv.writer(csv_file, delimiter="\t", quotechar='"') csv_writer.writerow(["STATUS", "URL", "WIKIURL"]) @@ -106,44 +107,31 @@ def dump_link_map(jsonfile: str, dumpfile: str) -> None: ) -def print_usage(status: int = 0) -> NoReturn: - print( - """ -Usage: python main.py [action] - [action] what? 
- —————————————————————————————————————————————————————————— - filter dumpxmlfile outxmlfile filter out unncesscary pages from dump - badlinks badlinksfile outfile parse and print known allowed.links - dumplinkmap jsonfile outfilelinkmap dumps a map of url and nixos article where it is present - help prints this help message and exits -""" - ) - sys.exit(status) - - def main() -> None: - if len(sys.argv) < 2: - print_usage(1) - action = sys.argv[1] - if action in "filter|badlinks|dumplinkmap": - if len(sys.argv) != 4: - print_usage(1) - if action == "filter": - dump_file = sys.argv[2] - out_file = sys.argv[3] - process_dump(dump_file, out_file) - elif action == "badlinks": - known_file = sys.argv[2] - out_file = sys.argv[3] - badlinks_print(known_file, out_file) - elif action == "dumplinkmap": - jsonfile = sys.argv[2] - dumpfile = sys.argv[3] - dump_link_map(jsonfile, dumpfile) - elif action in "--help": - print_usage(0) - else: - print_usage(1) + parser = argparse.ArgumentParser(description="Process wiki dump files") + subparsers = parser.add_subparsers() + parser_filter = subparsers.add_parser("filter", help="Filter out unimportant pages") + + parser_filter.add_argument("dump_file", type=str) + parser_filter.add_argument("out_file", type=str) + parser_filter.set_defaults(func=process_dump) + + parser_badlinks = subparsers.add_parser( + "badlinks", help="Parse and print known allowed links" + ) + parser_badlinks.add_argument("known_file", type=str) + parser_badlinks.add_argument("out_file", type=str) + parser_badlinks.set_defaults(func=badlinks_print) + + parser_dumplinkmap = subparsers.add_parser( + "dumplinkmap", help="Dump a map of url and nixos article where it is present" + ) + parser_dumplinkmap.add_argument("jsonfile", type=str) + parser_dumplinkmap.add_argument("dumpfile", type=str) + parser_dumplinkmap.set_defaults(func=dump_link_map) + + args = parser.parse_args() + args.func(args) if __name__ == "__main__": From 197dc548645ede3715b139a74c63718e0f05da79 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rg=20Thalheim?= Date: Tue, 17 Sep 2024 10:12:52 +0200 Subject: [PATCH 09/15] linkcheck: use pathlib --- checks/linkcheck/main.py | 27 +++++++++++++-------------- 1 file changed, 13 insertions(+), 14 deletions(-) diff --git a/checks/linkcheck/main.py b/checks/linkcheck/main.py index c949bce..5878a4b 100644 --- a/checks/linkcheck/main.py +++ b/checks/linkcheck/main.py @@ -4,6 +4,7 @@ import re import sys import argparse import xml.etree.ElementTree as ET +from pathlib import Path def get_revision_timestamp(revision: ET.Element, ns: dict[str, str]) -> str: @@ -24,8 +25,8 @@ def get_revision_timestamp(revision: ET.Element, ns: dict[str, str]) -> str: # filter out unimportant pages like Talk:, User:, and old revisions of posts -def process_dump( args: argparse.Namespace) -> None: - tree = ET.parse(args.dump_file) +def process_dump(args: argparse.Namespace) -> None: + tree = ET.parse(str(args.dump_file)) root = tree.getroot() ns = {"mw": "http://www.mediawiki.org/xml/export-0.11/"} @@ -60,12 +61,11 @@ def process_dump( args: argparse.Namespace) -> None: if revision != latest_revision: page.remove(revision) - tree.write(args.out_file, encoding="utf-8", xml_declaration=False) + tree.write(str(args.out_file), encoding="utf-8", xml_declaration=False) def badlinks_print(args: argparse.Namespace) -> None: - # known_file: str, outfile: str) -> None: - with open(args.known_file, "r") as infile, open(args.outfile, "w") as of: + with args.known_file.open() as infile, args.out_file.open("w") as of: 
for line in infile: stripped_line = line.strip() if stripped_line and not stripped_line.startswith("#"): @@ -73,10 +73,9 @@ def badlinks_print(args: argparse.Namespace) -> None: def dump_link_map(args: argparse.Namespace) -> None: - with open(args.jsonfile, "r") as json_file: - fail_data = json.load(json_file) + fail_data = json.loads(args.json_file.read_text()) - with open(args.dumpfile, mode="w", newline="", encoding="utf-8") as csv_file: + with args.dump_file.open(mode="w", newline="", encoding="utf-8") as csv_file: csv_writer = csv.writer(csv_file, delimiter="\t", quotechar='"') csv_writer.writerow(["STATUS", "URL", "WIKIURL"]) @@ -112,22 +111,22 @@ def main() -> None: subparsers = parser.add_subparsers() parser_filter = subparsers.add_parser("filter", help="Filter out unimportant pages") - parser_filter.add_argument("dump_file", type=str) - parser_filter.add_argument("out_file", type=str) + parser_filter.add_argument("dump_file", type=Path) + parser_filter.add_argument("out_file", type=Path) parser_filter.set_defaults(func=process_dump) parser_badlinks = subparsers.add_parser( "badlinks", help="Parse and print known allowed links" ) - parser_badlinks.add_argument("known_file", type=str) - parser_badlinks.add_argument("out_file", type=str) + parser_badlinks.add_argument("known_file", type=Path) + parser_badlinks.add_argument("out_file", type=Path) parser_badlinks.set_defaults(func=badlinks_print) parser_dumplinkmap = subparsers.add_parser( "dumplinkmap", help="Dump a map of url and nixos article where it is present" ) - parser_dumplinkmap.add_argument("jsonfile", type=str) - parser_dumplinkmap.add_argument("dumpfile", type=str) + parser_dumplinkmap.add_argument("json_file", type=Path) + parser_dumplinkmap.add_argument("dump_file", type=Path) parser_dumplinkmap.set_defaults(func=dump_link_map) args = parser.parse_args() From 94429be77fb2ca7ddeab6ae0ab148124684845a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rg=20Thalheim?= Date: Tue, 17 Sep 2024 10:25:16 +0200 Subject: [PATCH 10/15] linkcheck: allow to re-run report --- checks/linkcheck/lychee.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/checks/linkcheck/lychee.sh b/checks/linkcheck/lychee.sh index 6c63cd3..fea01ae 100755 --- a/checks/linkcheck/lychee.sh +++ b/checks/linkcheck/lychee.sh @@ -91,7 +91,7 @@ mv sorted_filename.tsv failed-wiki-links.csv cat failed-wiki-links.csv dest="../lychee-$(printf '%(%Y-%m-%d)T\n')-report" -mkdir "$dest" +mkdir -p "$dest" cp ../allowed.links lychee*.log failed-wiki-links.csv "$dest" popd || exit From ff19131911dd30e20672c733aadd3d5516b68b46 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rg=20Thalheim?= Date: Tue, 17 Sep 2024 10:27:21 +0200 Subject: [PATCH 11/15] linkcheck: add lxml to shebang --- checks/linkcheck/lychee.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/checks/linkcheck/lychee.sh b/checks/linkcheck/lychee.sh index fea01ae..ac607e4 100755 --- a/checks/linkcheck/lychee.sh +++ b/checks/linkcheck/lychee.sh @@ -1,5 +1,5 @@ #! /usr/bin/env nix-shell -#! nix-shell -i bash -p python3 curl zstd bash findutils gnused coreutils lychee +#! 
nix-shell -i bash -p 'python3.withPackages (ps: with ps; [ lxml ])' curl zstd bash findutils gnused coreutils lychee # shellcheck shell=bash SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd) From 2be4de6fc69d0f7ecd6bc52c040c283c44dccb64 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rg=20Thalheim?= Date: Tue, 17 Sep 2024 10:28:10 +0200 Subject: [PATCH 12/15] linkcheck: simplify instructions --- checks/linkcheck/README.md | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/checks/linkcheck/README.md b/checks/linkcheck/README.md index 4a1cb35..27cde74 100644 --- a/checks/linkcheck/README.md +++ b/checks/linkcheck/README.md @@ -21,13 +21,10 @@ Dead links if detected early have a chance to prevent linkrot. ## Instructions ```shell -cd ./checks/linkcheck -direnv allow # or # nix develop ..#linkcheck -./lychee.sh +./checks/linkcheck/lychee.sh ``` -It can be run from anywhere so `/path/to/checks/linkcheck/lychee.sh` works but -the report will be generated at `/path/to/checks/linkcheck`. +The report will be generated at `/path/to/checks/linkcheck`. As usual, `nix fmt` works inside linkcheck dir. From 7c9a68ff76fd54b54f03277554439ee1da86a1e6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rg=20Thalheim?= Date: Tue, 17 Sep 2024 10:37:30 +0200 Subject: [PATCH 13/15] linkcheck: sort csv in python --- checks/linkcheck/lychee.sh | 7 ----- checks/linkcheck/main.py | 58 +++++++++++++++++++++----------------- 2 files changed, 32 insertions(+), 33 deletions(-) diff --git a/checks/linkcheck/lychee.sh b/checks/linkcheck/lychee.sh index ac607e4..8d3aba0 100755 --- a/checks/linkcheck/lychee.sh +++ b/checks/linkcheck/lychee.sh @@ -81,13 +81,6 @@ timeout 30 lychee -E \ # csv of status, url, corresponding wiki page link python3 ../main.py dumplinkmap lychee.json failed-wiki-links.csv -# sort for consistency -{ - head -n 1 failed-wiki-links.csv - tail -n +2 failed-wiki-links.csv | sort -t$'\t' -k2,2 -} >sorted_filename.tsv -mv sorted_filename.tsv failed-wiki-links.csv - cat failed-wiki-links.csv dest="../lychee-$(printf '%(%Y-%m-%d)T\n')-report" diff --git a/checks/linkcheck/main.py b/checks/linkcheck/main.py index 5878a4b..02d0f64 100644 --- a/checks/linkcheck/main.py +++ b/checks/linkcheck/main.py @@ -3,6 +3,7 @@ import csv import re import sys import argparse +import bisect import xml.etree.ElementTree as ET from pathlib import Path @@ -72,38 +73,43 @@ def badlinks_print(args: argparse.Namespace) -> None: of.write(f"--exclude {stripped_line} ") +def read_lychee_file(lychee_file: Path) -> list[list[str]]: + fail_data = json.loads(lychee_file.read_text()) + failed_urls = [] + for xml_file, failed_url_entries in fail_data["fail_map"].items(): + with open(xml_file, "r", encoding="utf-8") as xmlf: + root = ET.fromstring(f"{xmlf.read()}") + for doc in root.findall("doc"): + title = doc.attrib.get("title") + if title is None: + print(f"Title not found in doc: {ET.tostring(doc)}", file=sys.stderr) + continue + title = re.sub(r"\s+", "_", title) + content = doc.text + for entry in failed_url_entries: + url = entry["url"] + status = entry.get("status", {}).get("code", 403) + if url in content: + bisect.insort( + failed_urls, + [ + status, + url, + f"https://wiki.nixos.org/wiki/{title}", + ], + ) + return failed_urls + + def dump_link_map(args: argparse.Namespace) -> None: - fail_data = json.loads(args.json_file.read_text()) + failed_urls = read_lychee_file(args.json_file) with args.dump_file.open(mode="w", newline="", encoding="utf-8") as csv_file: csv_writer = 
csv.writer(csv_file, delimiter="\t", quotechar='"') csv_writer.writerow(["STATUS", "URL", "WIKIURL"]) - for xml_file, failed_url_entries in fail_data["fail_map"].items(): - with open(xml_file, "r", encoding="utf-8") as xmlf: - root = ET.fromstring(f"{xmlf.read()}") - - for doc in root.findall("doc"): - title = doc.attrib.get("title") - if title is None: - print( - f"Title not found in doc: {ET.tostring(doc)}", file=sys.stderr - ) - continue - title = re.sub(r"\s+", "_", title) - content = doc.text - - for entry in failed_url_entries: - url = entry["url"] - status = entry.get("status", {}).get("code", 403) - if url in content: - csv_writer.writerow( - [ - status, - url, - f"https://wiki.nixos.org/wiki/{title}", - ] - ) + for item in failed_urls: + csv_writer.writerow(item) def main() -> None: From 91b55c69428bfe8e906bff6f087ac48ff4cddf52 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rg=20Thalheim?= Date: Tue, 17 Sep 2024 10:54:05 +0200 Subject: [PATCH 14/15] linkcheck: use array to pass args --- checks/linkcheck/lychee.sh | 30 +++++++++++++----------------- 1 file changed, 13 insertions(+), 17 deletions(-) diff --git a/checks/linkcheck/lychee.sh b/checks/linkcheck/lychee.sh index 8d3aba0..9fca97b 100755 --- a/checks/linkcheck/lychee.sh +++ b/checks/linkcheck/lychee.sh @@ -16,19 +16,19 @@ python3 ../main.py filter wikidump.xml wikidump-filtered.xml # generate exclude args from allowlist python3 ../main.py badlinks ../allowed.links exclude-args -# exlude sending requests to the wiki -echo "--exclude wiki.nixos.org/wiki" >>exclude-args -extrargs="$extrargs --exclude wiki.nixos.org/wiki" -excludeargs=$(cat exclude-args) +extrargs=( + # exlude sending requests to the wiki + "--exclude" "wiki.nixos.org/wiki" + # default is too high + "--max-concurrency" "16" +) +read -r -a excludeargs <<<"$(lychee.json # get archive suggestions # --timeout not working with --suggest see https://github.com/lycheeverse/lychee/issues/1501 # TODO remove timeout command later after the issue is fixed -# shellcheck disable=SC2086 timeout 30 lychee -E \ --cache --scheme http --scheme https \ - --include-verbatim $excludeargs $extrargs \ + --include-verbatim "${excludeargs[@]}" "${extrargs[@]}" \ --suggest \ text | tee lychee-wayback.log From 01b85c1ee5fdb67b1aa4ab344ac3d15d2f834144 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?J=C3=B6rg=20Thalheim?= Date: Tue, 17 Sep 2024 10:55:17 +0200 Subject: [PATCH 15/15] drop black as formatter since we already use ruff --- formatter.nix | 1 - 1 file changed, 1 deletion(-) diff --git a/formatter.nix b/formatter.nix index 5900b5e..319e4e0 100644 --- a/formatter.nix +++ b/formatter.nix @@ -29,7 +29,6 @@ ]; programs.shellcheck.enable = true; programs.deno.enable = true; - programs.black.enable = true; }; packages = { default = pkgs.mkShell {