Mirror of https://github.com/Mic92/nixos-wiki-infra.git, synced 2025-06-27 05:01:53 +02:00

Merge pull request #140 from phanirithvij/main

ci: add broken link checker action using lychee

Commit 75887b31fd
.github/workflows/check-dead-links.yml (vendored, new file)
@@ -0,0 +1,30 @@
name: "check broken links in wiki"
on:
  workflow_dispatch:
  # twice per month
  schedule:
    - cron: "0 14 1,15 * *"
jobs:
  lychee:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: cachix/install-nix-action@v27
        with:
          nix_path: nixpkgs=channel:nixos-unstable
      - run: ./checks/linkcheck/lychee.sh
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
      - name: upload lychee report as artifact
        uses: actions/upload-artifact@v4
        with:
          name: lychee-report
          if-no-files-found: error
          path: ./checks/linkcheck/lychee*-report
      - name: upload filtered xmldump artifact
        uses: actions/upload-artifact@v4
        with:
          name: wikidump-filtered.xml
          if-no-files-found: error
          path: ./checks/linkcheck/workdir/wikidump-filtered.xml
          compression-level: 9
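Besides the twice-monthly cron (14:00 UTC on the 1st and 15th), the workflow declares `workflow_dispatch:`, so it can also be started by hand. A minimal sketch using the GitHub CLI, with the mirror's repository slug assumed for illustration:

```shell
# trigger the link check manually (repository slug assumed)
gh workflow run check-dead-links.yml --repo Mic92/nixos-wiki-infra

# list recent runs of this workflow
gh run list --workflow=check-dead-links.yml --repo Mic92/nixos-wiki-infra
```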
.gitignore (vendored, 1 line added)
@@ -1,3 +1,4 @@
 /.envrc.private
 .terraform.lock.hcl
 **/.terraform
+.direnv
checks/linkcheck/.envrc (new file)
@@ -0,0 +1 @@
use flake .#linkcheck
checks/linkcheck/.gitignore (vendored, new file)
@@ -0,0 +1,5 @@
temp
.direnv
*-report
result*
workdir
checks/linkcheck/README.md (new file)
@@ -0,0 +1,35 @@
## What

A dead-links checker for wiki.nixos.org, run as a GitHub Action and powered by
[lychee](https://github.com/lycheeverse/lychee).

Runs twice per month; the schedule can be adjusted in the GitHub Action cron
job. Broken links need to be edited manually in the wiki; nothing is automated.

The initial run gave ~100 results, which were fixed manually; see the entries
before 16 Sep
[here](https://wiki.nixos.org/w/index.php?title=Special:Contributions/Phanirithvij&target=Phanirithvij&offset=&limit=100).

## Why

Dead links, if detected early, have a chance of being fixed before linkrot sets in.

- Why not use a broken-link-checker GitHub action?
  - This script is not tied to GitHub Actions (it also works locally).
  - The action only calls the script and uploads the artifacts.

## Instructions

```shell
./checks/linkcheck/lychee.sh
```

The report will be generated under `/path/to/checks/linkcheck`.

As usual, `nix fmt` works inside the linkcheck dir.

## TODO/Roadmap

- [ ] Archive all links found in the lychee scan (see `lychee --dump` and the sketch below).
  - Since these links are prone to deletion, it is our duty to archive them.
  - There was a cli tool for this, forgot what it is, rediscover it
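For the archiving TODO above, one possible direction (an unverified sketch, not part of this commit) is to pipe the `lychee --dump` output into the Internet Archive's save endpoint:

```shell
# sketch only: dump every link lychee finds in the extracted pages and ask the
# Wayback Machine to snapshot it; the save endpoint is rate-limited, hence the sleep
lychee --dump --scheme http --scheme https text | sort -u |
  while read -r url; do
    curl -fsS -o /dev/null "https://web.archive.org/save/$url" || true
    sleep 5
  done
```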
checks/linkcheck/allowed.links (new file)
@@ -0,0 +1,27 @@
# an allowlist of known-bad regexps
# each line can be a comment, a regex, or a full/partial url
# comments will be stripped out
# urls must be urlencoded
# stitched into --exclude args per line and passed to lychee

(http://(server|unix|hydra)|https://(cache|relay-server)/)

# %7B is { and matches urls inside nix expressions like ${version} etc.
(.*\.(domain|tld|mydomain|local).*|my.app|%7B)

(qemu/1.0|locating-rules/1.0)

# no need to fetch cache entries
(cache.nixos.org|fzakaria.cachix.org)

# urls in example snippets
(USERNAME/nixpkgs|your_username/nixpkgs|fooUser/barRepo|code.visualstudio.com/sha|path/to/patch)

# works in browser
https://www.phoronix.com/news/Mesa-Delete-Clover-Discussion

# works with git clone
https://review.coreboot.org/coreboot.git

# works in browser
https://pypi.org/project/stt/#files
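As the header comments say, the non-comment lines are turned into `--exclude` arguments (by `main.py badlinks`, further down in this commit) and spliced into the lychee invocation in `lychee.sh`. Roughly, the effective call looks like this sketch (only two of the generated `--exclude` pairs shown):

```shell
# sketch of the effective lychee invocation built by lychee.sh
lychee -E --cache --scheme http --scheme https --include-verbatim \
  --exclude '(cache.nixos.org|fzakaria.cachix.org)' \
  --exclude 'https://review.coreboot.org/coreboot.git' \
  --exclude 'wiki.nixos.org/wiki' \
  --max-concurrency 16 \
  text
```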
checks/linkcheck/lychee.sh (new executable file)
@@ -0,0 +1,87 @@
#! /usr/bin/env nix-shell
#! nix-shell -i bash -p 'python3.withPackages (ps: with ps; [ lxml ])' curl zstd bash findutils gnused coreutils lychee
# shellcheck shell=bash

SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)

workdir="$SCRIPT_DIR/workdir"
mkdir -p "$workdir"
pushd "$workdir" || exit

curl "https://wiki.nixos.org/wikidump.xml.zst" | zstd -d >wikidump.xml

# filter unimportant pages like User:* Talk:*
python3 ../main.py filter wikidump.xml wikidump-filtered.xml

# generate exclude args from the allowlist
python3 ../main.py badlinks ../allowed.links exclude-args

extrargs=(
  # exclude sending requests to the wiki itself
  "--exclude" "wiki.nixos.org/wiki"
  # default is too high
  "--max-concurrency" "16"
)
read -r -a excludeargs <<<"$(<exclude-args)"

# extract only the text from the filtered xml dump
nix --extra-experimental-features "nix-command flakes" run ..#wikiextractor wikidump-filtered.xml

# lychee requires .md or .html files to parse
find text -type f ! -name "*.html" -print0 | xargs -0 -I{} mv {} "{}.html"

# GITHUB_TOKEN from env, or fall back to gh (local dev)
if [ -z "${GITHUB_TOKEN}" ]; then
  if command -v gh &>/dev/null; then
    echo "using gh auth token"
    GITHUB_TOKEN=$(gh auth token)
  fi
fi

if [ -n "${GITHUB_TOKEN}" ]; then
  echo "using github token"
  extrargs+=("--github-token" "$GITHUB_TOKEN")
fi

# fetch links
lychee -E \
  --cache --scheme http --scheme https \
  --include-verbatim "${excludeargs[@]}" "${extrargs[@]}" \
  text |
  tee lychee.log

# get all links, ignoring the allowlist (allowed.links)
lychee -E \
  --cache --scheme http --scheme https \
  --include-verbatim "${extrargs[@]}" \
  text |
  tee lychee-full.log

# save fail_map so we can construct a map from failed urls to wiki pages
lychee -E \
  --cache --scheme http --scheme https \
  --include-verbatim "${excludeargs[@]}" "${extrargs[@]}" \
  --format json \
  text >lychee.json

# get archive suggestions
# --timeout not working with --suggest, see https://github.com/lycheeverse/lychee/issues/1501
# TODO remove the timeout command after the issue is fixed
timeout 30 lychee -E \
  --cache --scheme http --scheme https \
  --include-verbatim "${excludeargs[@]}" "${extrargs[@]}" \
  --suggest \
  text |
  tee lychee-wayback.log

# csv of status, url, corresponding wiki page link
python3 ../main.py dumplinkmap lychee.json failed-wiki-links.csv

cat failed-wiki-links.csv

dest="../lychee-$(printf '%(%Y-%m-%d)T\n')-report"
mkdir -p "$dest"
cp ../allowed.links lychee*.log failed-wiki-links.csv "$dest"

popd || exit
#rm -rf "$workdir"
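The script carries a `nix-shell` shebang, so it can be run directly; the commit also adds a `devShells.linkcheck` output and a matching `.envrc`. A sketch of the local workflow, assuming a flakes-enabled Nix:

```shell
# from the repository root, enter the dedicated dev shell and run the check
nix develop .#linkcheck
./checks/linkcheck/lychee.sh

# or let direnv pick up checks/linkcheck/.envrc ("use flake .#linkcheck")
cd checks/linkcheck && direnv allow
```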
checks/linkcheck/main.py (new file)
@@ -0,0 +1,143 @@
import json
import csv
import re
import sys
import argparse
import bisect
import xml.etree.ElementTree as ET
from pathlib import Path


def get_revision_timestamp(revision: ET.Element, ns: dict[str, str]) -> str:
    timestamp = revision.find("mw:timestamp", ns)
    if timestamp is None:
        print(
            f"Timestamp tag not found in revision: {ET.tostring(revision)}",
            file=sys.stderr,
        )
        return ""
    if timestamp.text is None:
        print(
            f"Timestamp text doesn't exist in revision: {ET.tostring(revision)}",
            file=sys.stderr,
        )
        return ""
    return timestamp.text


# filter out unimportant pages like Talk:, User:, and old revisions of posts
def process_dump(args: argparse.Namespace) -> None:
    tree = ET.parse(str(args.dump_file))
    root = tree.getroot()

    ns = {"mw": "http://www.mediawiki.org/xml/export-0.11/"}
    ET.register_namespace("", ns["mw"])

    for page in root.findall("mw:page", ns):
        title_tag = page.find("mw:title", ns)
        if title_tag is None:
            print(f"Title tag not found in page: {ET.tostring(page)}", file=sys.stderr)
            continue
        title = title_tag.text
        if title is None:
            print(
                f"Title text doesn't exist in page: {ET.tostring(page)}",
                file=sys.stderr,
            )
            continue

        if title.startswith("User:") or title.startswith("Talk:"):
            root.remove(page)
            continue

        revisions = page.findall("mw:revision", ns)

        if len(revisions) > 1:
            latest_revision = max(
                revisions, key=lambda revision: get_revision_timestamp(revision, ns)
            )

            # Remove all revisions except the latest one
            for revision in revisions:
                if revision != latest_revision:
                    page.remove(revision)

    tree.write(str(args.out_file), encoding="utf-8", xml_declaration=False)


def badlinks_print(args: argparse.Namespace) -> None:
    with args.known_file.open() as infile, args.out_file.open("w") as of:
        for line in infile:
            stripped_line = line.strip()
            if stripped_line and not stripped_line.startswith("#"):
                of.write(f"--exclude {stripped_line} ")


def read_lychee_file(lychee_file: Path) -> list[list[str]]:
    fail_data = json.loads(lychee_file.read_text())
    failed_urls = []
    for xml_file, failed_url_entries in fail_data["fail_map"].items():
        with open(xml_file, "r", encoding="utf-8") as xmlf:
            root = ET.fromstring(f"<root>{xmlf.read()}</root>")
        for doc in root.findall("doc"):
            title = doc.attrib.get("title")
            if title is None:
                print(f"Title not found in doc: {ET.tostring(doc)}", file=sys.stderr)
                continue
            title = re.sub(r"\s+", "_", title)
            content = doc.text or ""  # doc.text may be None for empty pages
            for entry in failed_url_entries:
                url = entry["url"]
                status = entry.get("status", {}).get("code", 403)
                if url in content:
                    bisect.insort(
                        failed_urls,
                        [
                            status,
                            url,
                            f"https://wiki.nixos.org/wiki/{title}",
                        ],
                    )
    return failed_urls


def dump_link_map(args: argparse.Namespace) -> None:
    failed_urls = read_lychee_file(args.json_file)

    with args.dump_file.open(mode="w", newline="", encoding="utf-8") as csv_file:
        csv_writer = csv.writer(csv_file, delimiter="\t", quotechar='"')
        csv_writer.writerow(["STATUS", "URL", "WIKIURL"])

        for item in failed_urls:
            csv_writer.writerow(item)


def main() -> None:
    parser = argparse.ArgumentParser(description="Process wiki dump files")
    subparsers = parser.add_subparsers()
    parser_filter = subparsers.add_parser("filter", help="Filter out unimportant pages")

    parser_filter.add_argument("dump_file", type=Path)
    parser_filter.add_argument("out_file", type=Path)
    parser_filter.set_defaults(func=process_dump)

    parser_badlinks = subparsers.add_parser(
        "badlinks", help="Parse and print known allowed links"
    )
    parser_badlinks.add_argument("known_file", type=Path)
    parser_badlinks.add_argument("out_file", type=Path)
    parser_badlinks.set_defaults(func=badlinks_print)

    parser_dumplinkmap = subparsers.add_parser(
        "dumplinkmap", help="Dump a map of url and nixos article where it is present"
    )
    parser_dumplinkmap.add_argument("json_file", type=Path)
    parser_dumplinkmap.add_argument("dump_file", type=Path)
    parser_dumplinkmap.set_defaults(func=dump_link_map)

    args = parser.parse_args()
    args.func(args)


if __name__ == "__main__":
    main()
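For reference, these are the three subcommands exactly as `lychee.sh` invokes them (paths relative to `checks/linkcheck/workdir`):

```shell
python3 ../main.py filter wikidump.xml wikidump-filtered.xml      # drop User:/Talk: pages and old revisions
python3 ../main.py badlinks ../allowed.links exclude-args         # emit "--exclude <pattern>" pairs
python3 ../main.py dumplinkmap lychee.json failed-wiki-links.csv  # TSV of STATUS, URL, WIKIURL
```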
checks/linkcheck/pkgs/default.nix (new file)
@@ -0,0 +1,4 @@
{ pkgs ? import <nixpkgs> { } }:
{
  wikiextractor = pkgs.callPackage ./wikiextractor.nix { };
}
checks/linkcheck/pkgs/wikiextractor.nix (new file)
@@ -0,0 +1,44 @@
{ lib
, python3
, fetchpatch
, fetchFromGitHub
,
}:

python3.pkgs.buildPythonApplication rec {
  pname = "wikiextractor";
  version = "3.0.7";
  pyproject = true;

  src = fetchFromGitHub {
    owner = "attardi";
    repo = "wikiextractor";
    rev = "v${version}";
    hash = "sha256-QeBC6ACHGKCSegd+wnOyIZI93L+f1EU62sFE0sAEwhU=";
  };

  build-system = [
    python3.pkgs.setuptools
    python3.pkgs.wheel
  ];

  pythonImportsCheck = [
    "wikiextractor"
  ];

  patches = [
    # https://github.com/attardi/wikiextractor/issues/336#issuecomment-2322886454
    (fetchpatch {
      url = "https://github.com/attardi/wikiextractor/commit/ab8988ebfa9e4557411f3d4c0f4ccda139e18875.patch";
      hash = "sha256-K1N6BA3FLieBTMIg9fyavc9ZajAr0vs754Nox53htmY=";
    })
  ];

  meta = {
    description = "A tool for extracting plain text from Wikipedia dumps";
    homepage = "https://github.com/attardi/wikiextractor";
    license = lib.licenses.agpl3Only;
    maintainers = with lib.maintainers; [ phanirithvij ];
    mainProgram = "wikiextractor";
  };
}
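Because the flake merges this attribute set into its `packages` output (next hunk), the derivation can be exercised on its own; a small sketch:

```shell
# build the patched wikiextractor from the repository root
nix build .#wikiextractor

# lychee.sh runs it against the filtered dump like this (from checks/linkcheck/workdir)
nix run ..#wikiextractor wikidump-filtered.xml
```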
@@ -30,7 +30,8 @@
       programs.shellcheck.enable = true;
       programs.deno.enable = true;
     };
-    packages.default = pkgs.mkShell {
+    packages = {
+      default = pkgs.mkShell {
       packages =
         let
           convert2Tofu =
@@ -56,5 +57,13 @@
           ))
         ];
       };
+      }
+      // (import ./checks/linkcheck/pkgs { inherit pkgs; });
+      devShells.linkcheck = pkgs.mkShell {
+        packages = [
+          pkgs.lychee
+          (pkgs.python3.withPackages (pypkgs: [ pypkgs.lxml ]))
+        ];
+      };
     };
   }