mirror of https://github.com/Mic92/nixos-wiki-infra.git (synced 2024-11-21 15:53:34 +01:00)
ci: add broken link checker action
Signed-off-by: phanirithvij <phanirithvij2000@gmail.com>
parent 2e25f4e018
commit 0afcb72267

.github/workflows/check-dead-links.yml (vendored, new file, 30 lines)
@@ -0,0 +1,30 @@
name: "check broken links in wiki"
on:
  workflow_dispatch:
  # twice per month
  schedule:
    - cron: "0 14 1,15 * *"
jobs:
  lychee:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      - uses: cachix/install-nix-action@v27
        with:
          nix_path: nixpkgs=channel:nixos-unstable
      - run: ./checks/linkcheck/lychee.sh
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
      - name: upload lychee report as artifact
        uses: actions/upload-artifact@v4
        with:
          name: lychee-report
          if-no-files-found: error
          path: ./checks/linkcheck/lychee*-report
      - name: upload filtered xmldump artifact
        uses: actions/upload-artifact@v4
        with:
          name: wikidump-filtered.xml
          if-no-files-found: error
          path: ./checks/linkcheck/workdir/wikidump-filtered.xml
          compression-level: 9
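
Because the workflow declares `workflow_dispatch:`, it can also be started by hand in addition to the twice-a-month cron. One possible way to do that with the GitHub CLI, assuming an authenticated `gh` in a checkout of this repo (illustrative, not part of this commit):

```shell
# kick off the link check on demand
gh workflow run check-dead-links.yml

# check progress and fetch the report artifact afterwards
gh run list --workflow=check-dead-links.yml
gh run download --name lychee-report
```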

checks/linkcheck/.envrc (new file, 1 line)
@@ -0,0 +1 @@
use flake .#linkcheck

checks/linkcheck/.gitignore (vendored, new file, 5 lines)
@@ -0,0 +1,5 @@
temp
.direnv
*-report
result*
workdir

checks/linkcheck/README.md (new file, 38 lines)
@@ -0,0 +1,38 @@
## What

A dead link checker for wiki.nixos.org, powered by
[lychee](https://github.com/lycheeverse/lychee) and run as a GitHub Action.

Runs twice per month; the schedule can be adjusted in the GitHub Action cron
job. Broken links need to be edited manually in the wiki, nothing is automated.

The initial run gave ~100 results, which were fixed manually; see the entries
before 16 Sep
[here](https://wiki.nixos.org/w/index.php?title=Special:Contributions/Phanirithvij&target=Phanirithvij&offset=&limit=100).

## Why

Dead links, if detected early, can be fixed before link rot sets in.

- Why not use a broken-link-checker GitHub Action?
  - This script is not tied to GitHub Actions (it works locally).
  - The Action only calls the script and uploads the artifacts.

## Instructions

```shell
cd ./checks/linkcheck
direnv allow # or # nix develop ..#linkcheck
./lychee.sh
```

It can be run from anywhere, so `/path/to/checks/linkcheck/lychee.sh` works, but
the report will be generated in `/path/to/checks/linkcheck`.

As usual, `nix fmt` works inside the linkcheck dir.

## TODO/Roadmap

- [ ] Archive all links found in the lychee scan (see `lychee --dump`); a
      possible approach is sketched after this README.
  - Since these links are prone to deletion, it is our duty to archive them.
  - There was a CLI tool for this; forgot what it is, rediscover it.
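
A minimal sketch of how the archiving TODO above could be wired up, assuming `lychee --dump` for link collection and the Wayback Machine's `https://web.archive.org/save/` endpoint; the script name and the politeness delay are illustrative, not part of this commit:

```shell
# hypothetical checks/linkcheck/archive-links.sh (not part of this commit)
# collect every link lychee finds in the extracted wiki text...
lychee --dump text | sort -u >all-links.txt

# ...and ask the Wayback Machine to snapshot each one
while read -r url; do
  if curl -fsS -o /dev/null "https://web.archive.org/save/$url"; then
    echo "archived: $url"
  else
    echo "failed:   $url" >&2
  fi
  sleep 5 # stay polite to archive.org
done <all-links.txt
```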

checks/linkcheck/allowed.links (new file, 27 lines)
@@ -0,0 +1,27 @@
# an allowlist of known bad regexes
# each line can be a comment, a regex, or a full/partial url
# comments will be stripped out
# urls must be urlencoded
# each line is stitched into an --exclude arg and passed to lychee
# (see the example after this file)

(http://(server|unix|hydra)|https://(cache|relay-server)/)

# %7B is '{'; matches urls inside nix expressions like ${version} etc.
(.*\.(domain|tld|mydomain|local).*|my.app|%7B)

(qemu/1.0|locating-rules/1.0)

# no need to fetch cache entries
(cache.nixos.org|fzakaria.cachix.org)

# urls in example snippets
(USERNAME/nixpkgs|your_username/nixpkgs|fooUser/barRepo|code.visualstudio.com/sha|path/to/patch)

# works in browser
https://www.phoronix.com/news/Mesa-Delete-Clover-Discussion

# works with git clone
https://review.coreboot.org/coreboot.git

# works in browser
https://pypi.org/project/stt/#files
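
To make the stitching concrete: `main.py badlinks` (later in this commit) writes one `--exclude <line>` per non-comment line of `allowed.links`, and `lychee.sh` passes the result straight to lychee. An illustrative expansion for two of the entries above, not literal output from this commit:

```shell
# exclude-args, as generated by `python ../main.py badlinks ../allowed.links exclude-args`,
# is roughly one long line:
#   --exclude (http://(server|unix|hydra)|https://(cache|relay-server)/) --exclude (cache.nixos.org|fzakaria.cachix.org) ...
# which lychee.sh splats unquoted into the lychee call, equivalent to:
lychee -E --cache --scheme http --scheme https --include-verbatim \
  --exclude '(http://(server|unix|hydra)|https://(cache|relay-server)/)' \
  --exclude '(cache.nixos.org|fzakaria.cachix.org)' \
  text
```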

checks/linkcheck/lychee.sh (new executable file, 99 lines)
@@ -0,0 +1,99 @@
#! /usr/bin/env nix-shell
#! nix-shell -i bash -p wget p7zip bash findutils gnused coreutils lychee
# shellcheck shell=bash

SCRIPT_DIR=$(cd -- "$(dirname -- "${BASH_SOURCE[0]}")" &>/dev/null && pwd)

workdir="$SCRIPT_DIR/workdir"
mkdir -p "$workdir"
pushd "$workdir" || exit

wget -O wikidump.xml.zst "https://wiki.nixos.org/wikidump.xml.zst"
7z x -aoa wikidump.xml.zst

# filter unimportant pages like User:* Talk:*
python ../main.py filter wikidump.xml wikidump-filtered.xml

# generate exclude args from the allowlist
python ../main.py badlinks ../allowed.links exclude-args

# exclude sending requests to the wiki itself
echo "--exclude wiki.nixos.org/wiki" >>exclude-args
extrargs="$extrargs --exclude wiki.nixos.org/wiki"
excludeargs=$(cat exclude-args)

# extract only the text from the filtered xml dump
nix --extra-experimental-features "nix-command flakes" run ..#wikiextractor wikidump-filtered.xml

# lychee requires .md or .html format files to parse
find text -type f | grep -v .html | xargs -I{} mv {} "{}.html"

# the default is too high
extrargs="$extrargs --max-concurrency 16"

# GITHUB_TOKEN from env, or fall back to gh (local dev)
if [ -z "${GITHUB_TOKEN}" ]; then
  if command -v gh &>/dev/null; then
    echo using gh auth token
    GITHUB_TOKEN=$(gh auth token)
  fi
fi

if [ -n "${GITHUB_TOKEN}" ]; then
  echo using github token
  extrargs="$extrargs --github-token $GITHUB_TOKEN"
fi

# shellcheck disable=SC2086
# fetch links
lychee -E \
  --cache --scheme http --scheme https \
  --include-verbatim $excludeargs $extrargs \
  text |
  tee lychee.log

# shellcheck disable=SC2086
# get all links, ignoring the allowlist (allowed.links)
lychee -E \
  --cache --scheme http --scheme https \
  --include-verbatim $extrargs \
  text |
  tee lychee-full.log

# shellcheck disable=SC2086
# save fail_map so we can construct a map from failed urls to wiki pages
lychee -E \
  --cache --scheme http --scheme https \
  --include-verbatim $excludeargs $extrargs \
  --format json \
  text >lychee.json

# get archive suggestions
# --timeout is not working with --suggest, see https://github.com/lycheeverse/lychee/issues/1501
# TODO remove the timeout command once the issue is fixed
# shellcheck disable=SC2086
timeout 30 lychee -E \
  --cache --scheme http --scheme https \
  --include-verbatim $excludeargs $extrargs \
  --suggest \
  text |
  tee lychee-wayback.log

# csv of status, url, and the corresponding wiki page link
python ../main.py dumplinkmap lychee.json failed-wiki-links.csv

# sort for consistency
{
  head -n 1 failed-wiki-links.csv
  tail -n +2 failed-wiki-links.csv | sort -t$'\t' -k2,2
} >sorted_filename.tsv
mv sorted_filename.tsv failed-wiki-links.csv

cat failed-wiki-links.csv

dest="../lychee-$(printf '%(%Y-%m-%d)T\n')-report"
mkdir "$dest"
cp ../allowed.links lychee*.log failed-wiki-links.csv "$dest"

popd || exit
#rm -rf "$workdir"

checks/linkcheck/main.py (new file, 113 lines)
@@ -0,0 +1,113 @@
import json
import csv
import re
import sys
import xml.etree.ElementTree as ET


# filter out unimportant pages like Talk:, User:, and old revisions of posts
def process_dump(dump_file, out_file):
    tree = ET.parse(dump_file)
    root = tree.getroot()

    ns = {"mw": "http://www.mediawiki.org/xml/export-0.11/"}
    ET.register_namespace("", ns["mw"])

    for page in root.findall("mw:page", ns):
        title = page.find("mw:title", ns).text

        if title.startswith("User:") or title.startswith("Talk:"):
            root.remove(page)
            continue

        revisions = page.findall("mw:revision", ns)

        if len(revisions) > 1:
            latest_revision = max(
                revisions, key=lambda rev: rev.find("mw:timestamp", ns).text
            )

            # Remove all revisions except the latest one
            for revision in revisions:
                if revision != latest_revision:
                    page.remove(revision)

    tree.write(out_file, encoding="utf-8", xml_declaration=False)


def badlinks_print(known_file, outfile):
    with open(known_file, "r") as infile, open(outfile, "w") as of:
        for line in infile:
            stripped_line = line.strip()
            if stripped_line and not stripped_line.startswith("#"):
                of.write(f"--exclude {stripped_line} ")


def dump_link_map(jsonfile, dumpfile):
    with open(jsonfile, "r") as json_file:
        fail_data = json.load(json_file)

    with open(dumpfile, mode="w", newline="", encoding="utf-8") as csv_file:
        csv_writer = csv.writer(csv_file, delimiter="\t", quotechar='"')
        csv_writer.writerow(["STATUS", "URL", "WIKIURL"])

        for xml_file, failed_url_entries in fail_data["fail_map"].items():
            with open(xml_file, "r", encoding="utf-8") as xmlf:
                root = ET.fromstring(f"<root>{xmlf.read()}</root>")

            for doc in root.findall("doc"):
                title = doc.attrib.get("title")
                title = re.sub(r"\s+", "_", title)
                content = doc.text

                for entry in failed_url_entries:
                    url = entry["url"]
                    status = entry.get("status", {}).get("code", 403)
                    if url in content:
                        csv_writer.writerow(
                            [
                                status,
                                url,
                                f"https://wiki.nixos.org/wiki/{title}",
                            ]
                        )


def print_usage(status=0):
    print(
        """
        Usage: python main.py [action] <inputfile> <outfile>

        [action]      <inputfile>    <outfile>        what?
        ——————————————————————————————————————————————————————————
        filter        dumpxmlfile    outxmlfile       filter out unnecessary pages from the dump
        badlinks      badlinksfile   outfile          parse known allowed.links and print --exclude args
        dumplinkmap   jsonfile       outfilelinkmap   dump a map of failed urls and the wiki articles containing them
        help                                          print this help message and exit
        """
    )
    sys.exit(status)


if __name__ == "__main__":
    if len(sys.argv) < 2:
        print_usage(1)
    action = sys.argv[1]
    if action in ("filter", "badlinks", "dumplinkmap"):
        if len(sys.argv) != 4:
            print_usage(1)
        if action == "filter":
            dump_file = sys.argv[2]
            out_file = sys.argv[3]
            process_dump(dump_file, out_file)
        elif action == "badlinks":
            known_file = sys.argv[2]
            out_file = sys.argv[3]
            badlinks_print(known_file, out_file)
        elif action == "dumplinkmap":
            jsonfile = sys.argv[2]
            dumpfile = sys.argv[3]
            dump_link_map(jsonfile, dumpfile)
    elif action in ("--help", "-h", "help"):
        print_usage(0)
    else:
        print_usage(1)
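
To exercise the link-map step on its own after `lychee.sh` has produced `lychee.json` in the workdir, something like the following should work; the output row is illustrative, not real data:

```shell
cd checks/linkcheck/workdir
python ../main.py dumplinkmap lychee.json failed-wiki-links.csv
head -n 2 failed-wiki-links.csv
# STATUS  URL                            WIKIURL
# 404     https://example.org/some-page  https://wiki.nixos.org/wiki/Some_Article
```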

checks/linkcheck/pkgs/default.nix (new file, 4 lines)
@@ -0,0 +1,4 @@
{ pkgs ? import <nixpkgs> { } }:
{
  wikiextractor = pkgs.callPackage ./wikiextractor.nix { };
}

checks/linkcheck/pkgs/wikiextractor.nix (new file, 44 lines)
@@ -0,0 +1,44 @@
{ lib
, python3
, fetchpatch
, fetchFromGitHub
,
}:

python3.pkgs.buildPythonApplication rec {
  pname = "wikiextractor";
  version = "3.0.7";
  pyproject = true;

  src = fetchFromGitHub {
    owner = "attardi";
    repo = "wikiextractor";
    rev = "v${version}";
    hash = "sha256-QeBC6ACHGKCSegd+wnOyIZI93L+f1EU62sFE0sAEwhU=";
  };

  build-system = [
    python3.pkgs.setuptools
    python3.pkgs.wheel
  ];

  pythonImportsCheck = [
    "wikiextractor"
  ];

  patches = [
    # https://github.com/attardi/wikiextractor/issues/336#issuecomment-2322886454
    (fetchpatch {
      url = "https://github.com/attardi/wikiextractor/commit/ab8988ebfa9e4557411f3d4c0f4ccda139e18875.patch";
      hash = "sha256-K1N6BA3FLieBTMIg9fyavc9ZajAr0vs754Nox53htmY=";
    })
  ];

  meta = {
    description = "A tool for extracting plain text from Wikipedia dumps";
    homepage = "https://github.com/attardi/wikiextractor";
    license = lib.licenses.agpl3Only;
    maintainers = with lib.maintainers; [ phanirithvij ];
    mainProgram = "wikiextractor";
  };
}

@@ -29,32 +29,42 @@
       ];
       programs.shellcheck.enable = true;
       programs.deno.enable = true;
       programs.black.enable = true;
     };
-    packages.default = pkgs.mkShell {
-      packages =
-        let
-          convert2Tofu =
-            provider:
-            provider.override (prev: {
-              homepage = builtins.replaceStrings [ "registry.terraform.io/providers" ] [
-                "registry.opentofu.org"
-              ]
-              prev.homepage;
-            });
-        in
-        [
-          pkgs.bashInteractive
-          pkgs.sops
-          (pkgs.opentofu.withPlugins (
-            p:
-            builtins.map convert2Tofu [
-              p.hcloud
-              p.null
-              p.external
-              p.local
-            ]
-          ))
-        ];
-    };
+    packages = {
+      default = pkgs.mkShell {
+        packages =
+          let
+            convert2Tofu =
+              provider:
+              provider.override (prev: {
+                homepage = builtins.replaceStrings [ "registry.terraform.io/providers" ] [
+                  "registry.opentofu.org"
+                ]
+                prev.homepage;
+              });
+          in
+          [
+            pkgs.bashInteractive
+            pkgs.sops
+            (pkgs.opentofu.withPlugins (
+              p:
+              builtins.map convert2Tofu [
+                p.hcloud
+                p.null
+                p.external
+                p.local
+              ]
+            ))
+          ];
+      };
+    }
+    // (import ./checks/linkcheck/pkgs { inherit pkgs; });
+    devShells.linkcheck = pkgs.mkShell {
+      packages = [
+        pkgs.lychee
+        (pkgs.python3.withPackages (pypkgs: [ pypkgs.lxml ]))
+      ];
+    };
   };
 }