From 2d5336d3edf015d4bd90567b4bfaf9a3cd11a1ab Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?J=C3=B6rg=20Thalheim?=
Date: Tue, 17 Sep 2024 10:02:39 +0200
Subject: [PATCH] linkcheck: make xml parsing more robust and fix types

---
 checks/linkcheck/main.py | 36 ++++++++++++++++++++++++++++++++++--
 1 file changed, 34 insertions(+), 2 deletions(-)

diff --git a/checks/linkcheck/main.py b/checks/linkcheck/main.py
index 8dc5270..4d00f78 100644
--- a/checks/linkcheck/main.py
+++ b/checks/linkcheck/main.py
@@ -6,6 +6,23 @@ import xml.etree.ElementTree as ET
 from typing import NoReturn
 
 
+def get_revision_timestamp(revision: ET.Element, ns: dict[str, str]) -> str:
+    timestamp = revision.find("mw:timestamp", ns)
+    if timestamp is None:
+        print(
+            f"Timestamp tag not found in revision: {ET.tostring(revision)}",
+            file=sys.stderr,
+        )
+        return ""
+    if timestamp.text is None:
+        print(
+            f"Timestamp text doesn't exist in revision: {ET.tostring(revision)}",
+            file=sys.stderr,
+        )
+        return ""
+    return timestamp.text
+
+
 # filter out unimportant pages like Talk:, User:, and old revisions of posts
 def process_dump(dump_file: str, out_file: str) -> None:
     tree = ET.parse(dump_file)
@@ -15,7 +32,17 @@ def process_dump(dump_file: str, out_file: str) -> None:
     ET.register_namespace("", ns["mw"])
 
     for page in root.findall("mw:page", ns):
-        title = page.find("mw:title", ns).text
+        title_tag = page.find("mw:title", ns)
+        if title_tag is None:
+            print(f"Title tag not found in page: {ET.tostring(page)}", file=sys.stderr)
+            continue
+        title = title_tag.text
+        if title is None:
+            print(
+                f"Title text doesn't exist in page: {ET.tostring(page)}",
+                file=sys.stderr,
+            )
+            continue
 
         if title.startswith("User:") or title.startswith("Talk:"):
             root.remove(page)
@@ -25,7 +52,7 @@ def process_dump(dump_file: str, out_file: str) -> None:
 
         if len(revisions) > 1:
             latest_revision = max(
-                revisions, key=lambda rev: rev.find("mw:timestamp", ns).text
+                revisions, key=lambda revison: get_revision_timestamp(revison, ns)
             )
 
             # Remove all revisions except the latest one
@@ -58,6 +85,11 @@ def dump_link_map(jsonfile: str, dumpfile: str) -> None:
 
             for doc in root.findall("doc"):
                 title = doc.attrib.get("title")
+                if title is None:
+                    print(
+                        f"Title not found in doc: {ET.tostring(doc)}", file=sys.stderr
+                    )
+                    continue
                 title = re.sub(r"\s+", "_", title)
                 content = doc.text
 