linkcheck: sort csv in python

Jörg Thalheim 2024-09-17 10:37:30 +02:00
parent 2be4de6fc6
commit 7c9a68ff76
2 changed files with 32 additions and 33 deletions

View File

@@ -81,13 +81,6 @@ timeout 30 lychee -E \
 # csv of status, url, corresponding wiki page link
 python3 ../main.py dumplinkmap lychee.json failed-wiki-links.csv
-# sort for consistency
-{
-  head -n 1 failed-wiki-links.csv
-  tail -n +2 failed-wiki-links.csv | sort -t$'\t' -k2,2
-} >sorted_filename.tsv
-mv sorted_filename.tsv failed-wiki-links.csv
-cat failed-wiki-links.csv
 dest="../lychee-$(printf '%(%Y-%m-%d)T\n')-report"
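The removed block kept the header row in place and sorted the body by the second tab-separated field (the URL). For reference, a minimal standalone Python sketch of that same header-preserving sort, assuming the STATUS/URL/WIKIURL layout that main.py writes (the commit instead keeps rows sorted while building them; see the main.py diff below):

    # Hypothetical equivalent of the removed shell block: keep the header,
    # sort the remaining rows by the URL column (field 2, tab-separated).
    import csv
    from pathlib import Path

    def sort_report(path: Path) -> None:
        with path.open(newline="", encoding="utf-8") as f:
            rows = list(csv.reader(f, delimiter="\t"))
        header, body = rows[0], rows[1:]
        body.sort(key=lambda row: row[1])  # like `sort -t$'\t' -k2,2`
        with path.open("w", newline="", encoding="utf-8") as f:
            writer = csv.writer(f, delimiter="\t", quotechar='"')
            writer.writerow(header)
            writer.writerows(body)

    sort_report(Path("failed-wiki-links.csv"))

One ordering difference worth noting: the shell sort keyed on the URL column only, while the new approach orders full [status, url, wikiurl] rows, so the report now sorts by status code first and by URL second.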

View File

@@ -3,6 +3,7 @@ import csv
 import re
 import sys
 import argparse
+import bisect
 import xml.etree.ElementTree as ET
 from pathlib import Path
@@ -72,38 +73,43 @@ def badlinks_print(args: argparse.Namespace) -> None:
         of.write(f"--exclude {stripped_line} ")
 
 
+def read_lychee_file(lychee_file: Path) -> list[list[str]]:
+    fail_data = json.loads(lychee_file.read_text())
+    failed_urls = []
+    for xml_file, failed_url_entries in fail_data["fail_map"].items():
+        with open(xml_file, "r", encoding="utf-8") as xmlf:
+            root = ET.fromstring(f"<root>{xmlf.read()}</root>")
+        for doc in root.findall("doc"):
+            title = doc.attrib.get("title")
+            if title is None:
+                print(f"Title not found in doc: {ET.tostring(doc)}", file=sys.stderr)
+                continue
+            title = re.sub(r"\s+", "_", title)
+            content = doc.text
+            for entry in failed_url_entries:
+                url = entry["url"]
+                status = entry.get("status", {}).get("code", 403)
+                if url in content:
+                    bisect.insort(
+                        failed_urls,
+                        [
+                            status,
+                            url,
+                            f"https://wiki.nixos.org/wiki/{title}",
+                        ],
+                    )
+    return failed_urls
+
+
 def dump_link_map(args: argparse.Namespace) -> None:
-    fail_data = json.loads(args.json_file.read_text())
+    failed_urls = read_lychee_file(args.json_file)
     with args.dump_file.open(mode="w", newline="", encoding="utf-8") as csv_file:
         csv_writer = csv.writer(csv_file, delimiter="\t", quotechar='"')
         csv_writer.writerow(["STATUS", "URL", "WIKIURL"])
-        for xml_file, failed_url_entries in fail_data["fail_map"].items():
-            with open(xml_file, "r", encoding="utf-8") as xmlf:
-                root = ET.fromstring(f"<root>{xmlf.read()}</root>")
-            for doc in root.findall("doc"):
-                title = doc.attrib.get("title")
-                if title is None:
-                    print(
-                        f"Title not found in doc: {ET.tostring(doc)}", file=sys.stderr
-                    )
-                    continue
-                title = re.sub(r"\s+", "_", title)
-                content = doc.text
-                for entry in failed_url_entries:
-                    url = entry["url"]
-                    status = entry.get("status", {}).get("code", 403)
-                    if url in content:
-                        csv_writer.writerow(
-                            [
-                                status,
-                                url,
-                                f"https://wiki.nixos.org/wiki/{title}",
-                            ]
-                        )
+        for item in failed_urls:
+            csv_writer.writerow(item)
 
 
 def main() -> None:
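Since dump_link_map now just replays an already-sorted list, the ordering comes entirely from bisect.insort, which inserts each row at its sorted position (Python lists compare lexicographically, element by element). A small illustrative sketch with made-up rows:

    import bisect

    rows = []
    for row in (
        [404, "https://b.example", "https://wiki.nixos.org/wiki/B"],
        [403, "https://a.example", "https://wiki.nixos.org/wiki/A"],
        [404, "https://a.example", "https://wiki.nixos.org/wiki/A"],
    ):
        bisect.insort(rows, row)  # each insert keeps `rows` sorted

    # rows is now ordered by status first, then URL:
    # 403 https://a.example, 404 https://a.example, 404 https://b.example

Each insort is O(log n) to locate the position but O(n) to shift list elements, so building the list this way is O(n²) in the worst case; for a link-check report of this size that is negligible.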