linkcheck: make xml parsing more robust and fix types

This commit is contained in:
Jörg Thalheim 2024-09-17 10:02:39 +02:00
parent 4f79bc4c70
commit 2d5336d3ed

View File

@ -6,6 +6,23 @@ import xml.etree.ElementTree as ET
from typing import NoReturn
def get_revision_timestamp(revision: ET.Element, ns: dict[str, str]) -> str:
    """Return the text of the ``mw:timestamp`` child of *revision*.

    Args:
        revision: The revision element to inspect.
        ns: Prefix-to-URI namespace map; it must define the ``mw`` prefix
            used in the lookup.

    Returns:
        The timestamp text, or ``""`` when the tag is missing or has no
        text. The empty string compares lower than any non-empty string,
        so a revision without a timestamp never wins a ``max()`` keyed on
        this function against one that has a timestamp.

    A diagnostic is written to stderr in both failure cases instead of
    raising, so one malformed revision does not abort the caller.
    """
    timestamp = revision.find("mw:timestamp", ns)
    if timestamp is None:
        # encoding="unicode" makes tostring() return str; the default returns
        # bytes, which would be rendered as a b'...' repr inside the f-string.
        print(
            "Timestamp tag not found in revision: "
            f"{ET.tostring(revision, encoding='unicode')}",
            file=sys.stderr,
        )
        return ""
    if timestamp.text is None:
        print(
            "Timestamp text doesn't exist in revision: "
            f"{ET.tostring(revision, encoding='unicode')}",
            file=sys.stderr,
        )
        return ""
    return timestamp.text
# filter out unimportant pages like Talk:, User:, and old revisions of posts
def process_dump(dump_file: str, out_file: str) -> None:
tree = ET.parse(dump_file)
@ -15,7 +32,17 @@ def process_dump(dump_file: str, out_file: str) -> None:
ET.register_namespace("", ns["mw"])
for page in root.findall("mw:page", ns):
title = page.find("mw:title", ns).text
title_tag = page.find("mw:title", ns)
if title_tag is None:
print(f"Title tag not found in page: {ET.tostring(page)}", file=sys.stderr)
continue
title = title_tag.text
if title is None:
print(
f"Title text doesn't exist in page: {ET.tostring(page)}",
file=sys.stderr,
)
continue
if title.startswith("User:") or title.startswith("Talk:"):
root.remove(page)
@ -25,7 +52,7 @@ def process_dump(dump_file: str, out_file: str) -> None:
if len(revisions) > 1:
latest_revision = max(
revisions, key=lambda rev: rev.find("mw:timestamp", ns).text
revisions, key=lambda revison: get_revision_timestamp(revison, ns)
)
# Remove all revisions except the latest one
@ -58,6 +85,11 @@ def dump_link_map(jsonfile: str, dumpfile: str) -> None:
for doc in root.findall("doc"):
title = doc.attrib.get("title")
if title is None:
print(
f"Title not found in doc: {ET.tostring(doc)}", file=sys.stderr
)
continue
title = re.sub(r"\s+", "_", title)
content = doc.text