From 7c9a68ff76fd54b54f03277554439ee1da86a1e6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=B6rg=20Thalheim?=
Date: Tue, 17 Sep 2024 10:37:30 +0200
Subject: [PATCH] linkcheck: sort csv in python

---
 checks/linkcheck/lychee.sh |  7 -----
 checks/linkcheck/main.py   | 58 +++++++++++++++++++++-----------------
 2 files changed, 32 insertions(+), 33 deletions(-)

diff --git a/checks/linkcheck/lychee.sh b/checks/linkcheck/lychee.sh
index ac607e4..8d3aba0 100755
--- a/checks/linkcheck/lychee.sh
+++ b/checks/linkcheck/lychee.sh
@@ -81,13 +81,6 @@ timeout 30 lychee -E \
 # csv of status, url, corresponding wiki page link
 python3 ../main.py dumplinkmap lychee.json failed-wiki-links.csv
 
-# sort for consistency
-{
-  head -n 1 failed-wiki-links.csv
-  tail -n +2 failed-wiki-links.csv | sort -t$'\t' -k2,2
-} >sorted_filename.tsv
-mv sorted_filename.tsv failed-wiki-links.csv
-
 cat failed-wiki-links.csv
 
 dest="../lychee-$(printf '%(%Y-%m-%d)T\n')-report"
diff --git a/checks/linkcheck/main.py b/checks/linkcheck/main.py
index 5878a4b..02d0f64 100644
--- a/checks/linkcheck/main.py
+++ b/checks/linkcheck/main.py
@@ -3,6 +3,7 @@ import csv
 import re
 import sys
 import argparse
+import bisect
 import xml.etree.ElementTree as ET
 from pathlib import Path
 
@@ -72,38 +73,43 @@ def badlinks_print(args: argparse.Namespace) -> None:
             of.write(f"--exclude {stripped_line} ")
 
 
+def read_lychee_file(lychee_file: Path) -> list[list[str]]:
+    fail_data = json.loads(lychee_file.read_text())
+    failed_urls = []
+    for xml_file, failed_url_entries in fail_data["fail_map"].items():
+        with open(xml_file, "r", encoding="utf-8") as xmlf:
+            root = ET.fromstring(f"{xmlf.read()}")
+        for doc in root.findall("doc"):
+            title = doc.attrib.get("title")
+            if title is None:
+                print(f"Title not found in doc: {ET.tostring(doc)}", file=sys.stderr)
+                continue
+            title = re.sub(r"\s+", "_", title)
+            content = doc.text
+            for entry in failed_url_entries:
+                url = entry["url"]
+                status = entry.get("status", {}).get("code", 403)
+                if url in content:
+                    bisect.insort(
+                        failed_urls,
+                        [
+                            status,
+                            url,
+                            f"https://wiki.nixos.org/wiki/{title}",
+                        ],
+                    )
+    return failed_urls
+
+
 def dump_link_map(args: argparse.Namespace) -> None:
-    fail_data = json.loads(args.json_file.read_text())
+    failed_urls = read_lychee_file(args.json_file)
 
     with args.dump_file.open(mode="w", newline="", encoding="utf-8") as csv_file:
        csv_writer = csv.writer(csv_file, delimiter="\t", quotechar='"')
         csv_writer.writerow(["STATUS", "URL", "WIKIURL"])
 
-        for xml_file, failed_url_entries in fail_data["fail_map"].items():
-            with open(xml_file, "r", encoding="utf-8") as xmlf:
-                root = ET.fromstring(f"{xmlf.read()}")
-
-            for doc in root.findall("doc"):
-                title = doc.attrib.get("title")
-                if title is None:
-                    print(
-                        f"Title not found in doc: {ET.tostring(doc)}", file=sys.stderr
-                    )
-                    continue
-                title = re.sub(r"\s+", "_", title)
-                content = doc.text
-
-                for entry in failed_url_entries:
-                    url = entry["url"]
-                    status = entry.get("status", {}).get("code", 403)
-                    if url in content:
-                        csv_writer.writerow(
-                            [
-                                status,
-                                url,
-                                f"https://wiki.nixos.org/wiki/{title}",
-                            ]
-                        )
+        for item in failed_urls:
+            csv_writer.writerow(item)
 
 
 def main() -> None:
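
Review note, not part of the patch: the dropped shell pipeline sorted the finished CSV by its second column (the failed URL, via sort -t$'\t' -k2,2), while bisect.insort compares the [status, url, wikiurl] rows element-wise, so the file is now ordered by status code first and by URL only among equal statuses. A minimal standalone sketch of that behavior, with made-up rows:

    # Standalone sketch; the example rows are invented for illustration.
    import bisect

    rows: list[list] = []
    for row in (
        [404, "https://example.org/b", "https://wiki.nixos.org/wiki/Page_B"],
        [403, "https://example.org/c", "https://wiki.nixos.org/wiki/Page_C"],
        [404, "https://example.org/a", "https://wiki.nixos.org/wiki/Page_A"],
    ):
        # Python compares lists element-wise, so each insert keeps `rows`
        # ordered by status code first, then URL, then wiki URL.
        bisect.insort(rows, row)

    assert rows[0][0] == 403                          # lowest status first
    assert [r[1] for r in rows[1:]] == [
        "https://example.org/a",                      # status ties broken by URL
        "https://example.org/b",
    ]

If URL-primary ordering matters for diffing reports between runs, something like sorted(failed_urls, key=lambda r: r[1]) at write time would restore the old key.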