From 7c9a68ff76fd54b54f03277554439ee1da86a1e6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=B6rg=20Thalheim?=
Date: Tue, 17 Sep 2024 10:37:30 +0200
Subject: [PATCH] linkcheck: sort csv in python

---
 checks/linkcheck/lychee.sh |  7 -----
 checks/linkcheck/main.py   | 58 +++++++++++++++++++++-----------------
 2 files changed, 32 insertions(+), 33 deletions(-)

diff --git a/checks/linkcheck/lychee.sh b/checks/linkcheck/lychee.sh
index ac607e4..8d3aba0 100755
--- a/checks/linkcheck/lychee.sh
+++ b/checks/linkcheck/lychee.sh
@@ -81,13 +81,6 @@ timeout 30 lychee -E \
 # csv of status, url, corresponding wiki page link
 python3 ../main.py dumplinkmap lychee.json failed-wiki-links.csv
 
-# sort for consistency
-{
-  head -n 1 failed-wiki-links.csv
-  tail -n +2 failed-wiki-links.csv | sort -t$'\t' -k2,2
-} >sorted_filename.tsv
-mv sorted_filename.tsv failed-wiki-links.csv
-
 cat failed-wiki-links.csv
 
 dest="../lychee-$(printf '%(%Y-%m-%d)T\n')-report"
diff --git a/checks/linkcheck/main.py b/checks/linkcheck/main.py
index 5878a4b..02d0f64 100644
--- a/checks/linkcheck/main.py
+++ b/checks/linkcheck/main.py
@@ -3,6 +3,7 @@ import csv
 import re
 import sys
 import argparse
+import bisect
 import xml.etree.ElementTree as ET
 from pathlib import Path
 
@@ -72,38 +73,43 @@ def badlinks_print(args: argparse.Namespace) -> None:
             of.write(f"--exclude {stripped_line} ")
 
 
+def read_lychee_file(lychee_file: Path) -> list[list[str]]:
+    fail_data = json.loads(lychee_file.read_text())
+    failed_urls = []
+    for xml_file, failed_url_entries in fail_data["fail_map"].items():
+        with open(xml_file, "r", encoding="utf-8") as xmlf:
+            root = ET.fromstring(f"{xmlf.read()}")
+        for doc in root.findall("doc"):
+            title = doc.attrib.get("title")
+            if title is None:
+                print(f"Title not found in doc: {ET.tostring(doc)}", file=sys.stderr)
+                continue
+            title = re.sub(r"\s+", "_", title)
+            content = doc.text
+            for entry in failed_url_entries:
+                url = entry["url"]
+                status = entry.get("status", {}).get("code", 403)
+                if url in content:
+                    bisect.insort(
+                        failed_urls,
+                        [
+                            status,
+                            url,
+                            f"https://wiki.nixos.org/wiki/{title}",
+                        ],
+                    )
+    return failed_urls
+
+
 def dump_link_map(args: argparse.Namespace) -> None:
-    fail_data = json.loads(args.json_file.read_text())
+    failed_urls = read_lychee_file(args.json_file)
 
     with args.dump_file.open(mode="w", newline="", encoding="utf-8") as csv_file:
        csv_writer = csv.writer(csv_file, delimiter="\t", quotechar='"')
         csv_writer.writerow(["STATUS", "URL", "WIKIURL"])
 
-        for xml_file, failed_url_entries in fail_data["fail_map"].items():
-            with open(xml_file, "r", encoding="utf-8") as xmlf:
-                root = ET.fromstring(f"{xmlf.read()}")
-
-            for doc in root.findall("doc"):
-                title = doc.attrib.get("title")
-                if title is None:
-                    print(
-                        f"Title not found in doc: {ET.tostring(doc)}", file=sys.stderr
-                    )
-                    continue
-                title = re.sub(r"\s+", "_", title)
-                content = doc.text
-
-                for entry in failed_url_entries:
-                    url = entry["url"]
-                    status = entry.get("status", {}).get("code", 403)
-                    if url in content:
-                        csv_writer.writerow(
-                            [
-                                status,
-                                url,
-                                f"https://wiki.nixos.org/wiki/{title}",
-                            ]
-                        )
+        for item in failed_urls:
+            csv_writer.writerow(item)
 
 
 def main() -> None:
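
Review note, not part of the patch: the dropped shell pipeline sorted the finished CSV by its second column (the failed URL, via sort -t$'\t' -k2,2), while bisect.insort compares the [status, url, wikiurl] rows element-wise, so the file is now ordered by status code first and by URL only among equal statuses. A minimal standalone sketch of that behavior, with made-up rows:

    # Standalone sketch; the example rows are invented for illustration.
    import bisect

    rows: list[list] = []
    for row in (
        [404, "https://example.org/b", "https://wiki.nixos.org/wiki/Page_B"],
        [403, "https://example.org/c", "https://wiki.nixos.org/wiki/Page_C"],
        [404, "https://example.org/a", "https://wiki.nixos.org/wiki/Page_A"],
    ):
        # Python compares lists element-wise, so each insert keeps `rows`
        # ordered by status code first, then URL, then wiki URL.
        bisect.insort(rows, row)

    assert rows[0][0] == 403                          # lowest status first
    assert [r[1] for r in rows[1:]] == [
        "https://example.org/a",                      # status ties broken by URL
        "https://example.org/b",
    ]

If URL-primary ordering matters for diffing reports between runs, something like sorted(failed_urls, key=lambda r: r[1]) at write time would restore the old key.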