linkcheck: use array to pass args

This commit is contained in:
Jörg Thalheim 2024-09-17 10:54:05 +02:00
parent 7c9a68ff76
commit 91b55c6942

View File

@@ -16,19 +16,19 @@ python3 ../main.py filter wikidump.xml wikidump-filtered.xml
# generate exclude args from allowlist # generate exclude args from allowlist
python3 ../main.py badlinks ../allowed.links exclude-args python3 ../main.py badlinks ../allowed.links exclude-args
# exclude sending requests to the wiki extrargs=(
echo "--exclude wiki.nixos.org/wiki" >>exclude-args # exclude sending requests to the wiki
extrargs="$extrargs --exclude wiki.nixos.org/wiki" "--exclude" "wiki.nixos.org/wiki"
excludeargs=$(cat exclude-args) # default is too high
"--max-concurrency" "16"
)
read -r -a excludeargs <<<"$(<exclude-args)"
# extract only the text from the filtered xml dump # extract only the text from the filtered xml dump
nix --extra-experimental-features "nix-command flakes" run ..#wikiextractor wikidump-filtered.xml nix --extra-experimental-features "nix-command flakes" run ..#wikiextractor wikidump-filtered.xml
# lychee requires .md or .html format files to parse # lychee requires .md or .html format files to parse
find text -type f | grep -v .html | xargs -I{} mv {} "{}.html" find text -type f ! -name "*.html" -print0 | xargs -0 -I{} mv {} "{}.html"
# default is too high
extrargs="$extrargs --max-concurrency 16"
# github_token from env or fallback to gh (local dev) # github_token from env or fallback to gh (local dev)
if [ -z "${GITHUB_TOKEN}" ]; then if [ -z "${GITHUB_TOKEN}" ]; then
@@ -40,40 +40,36 @@ fi
if [ -n "${GITHUB_TOKEN}" ]; then if [ -n "${GITHUB_TOKEN}" ]; then
echo using github token echo using github token
extrargs="$extrargs --github-token $GITHUB_TOKEN" extrargs+=("--github-token" "$GITHUB_TOKEN")
fi fi
# shellcheck disable=SC2086
# fetch links # fetch links
lychee -E \ lychee -E \
--cache --scheme http --scheme https \ --cache --scheme http --scheme https \
--include-verbatim $excludeargs $extrargs \ --include-verbatim "${excludeargs[@]}" "${extrargs[@]}" \
text | text |
tee lychee.log tee lychee.log
# shellcheck disable=SC2086
# get all links ignoring the allowlist (allowed.links) # get all links ignoring the allowlist (allowed.links)
lychee -E \ lychee -E \
--cache --scheme http --scheme https \ --cache --scheme http --scheme https \
--include-verbatim $extrargs \ --include-verbatim "${extrargs[@]}" \
text | text |
tee lychee-full.log tee lychee-full.log
# shellcheck disable=SC2086
# save fail_map so we can construct wiki link map to failed urls # save fail_map so we can construct wiki link map to failed urls
lychee -E \ lychee -E \
--cache --scheme http --scheme https \ --cache --scheme http --scheme https \
--include-verbatim $excludeargs $extrargs \ --include-verbatim "${excludeargs[@]}" "${extrargs[@]}" \
--format json \ --format json \
text >lychee.json text >lychee.json
# get archive suggestions # get archive suggestions
# --timeout not working with --suggest see https://github.com/lycheeverse/lychee/issues/1501 # --timeout not working with --suggest see https://github.com/lycheeverse/lychee/issues/1501
# TODO remove timeout command later after the issue is fixed # TODO remove timeout command later after the issue is fixed
# shellcheck disable=SC2086
timeout 30 lychee -E \ timeout 30 lychee -E \
--cache --scheme http --scheme https \ --cache --scheme http --scheme https \
--include-verbatim $excludeargs $extrargs \ --include-verbatim "${excludeargs[@]}" "${extrargs[@]}" \
--suggest \ --suggest \
text | text |
tee lychee-wayback.log tee lychee-wayback.log