mirror of
https://github.com/Mic92/nixos-wiki-infra.git
synced 2024-11-21 15:53:34 +01:00
linkcheck: make xml parsing more robust and fix types
This commit is contained in:
parent
4f79bc4c70
commit
2d5336d3ed
@ -6,6 +6,23 @@ import xml.etree.ElementTree as ET
|
||||
from typing import NoReturn
|
||||
|
||||
|
||||
def get_revision_timestamp(revision: ET.Element, ns: dict[str, str]) -> str:
    """Return the text of the ``mw:timestamp`` child of *revision*.

    Args:
        revision: The revision element to inspect.
        ns: Prefix-to-URI namespace mapping; it must define the ``mw``
            prefix used in the lookup.

    Returns:
        The timestamp text, or ``""`` when the timestamp tag is missing or
        has no text. In both failure cases a diagnostic containing the
        serialized revision is written to stderr instead of raising.
    """
    timestamp = revision.find("mw:timestamp", ns)
    if timestamp is None:
        problem = "Timestamp tag not found"
    elif timestamp.text is None:
        problem = "Timestamp text doesn't exist"
    else:
        return timestamp.text

    # ET.tostring() returns bytes by default, which an f-string renders as a
    # b'...' repr with escaped newlines; encoding="unicode" yields a plain str
    # so the diagnostic shows readable XML.
    serialized = ET.tostring(revision, encoding="unicode")
    print(f"{problem} in revision: {serialized}", file=sys.stderr)
    return ""
|
||||
|
||||
|
||||
# filter out unimportant pages like Talk:, User:, and old revisions of posts
|
||||
def process_dump(dump_file: str, out_file: str) -> None:
|
||||
tree = ET.parse(dump_file)
|
||||
@ -15,7 +32,17 @@ def process_dump(dump_file: str, out_file: str) -> None:
|
||||
ET.register_namespace("", ns["mw"])
|
||||
|
||||
for page in root.findall("mw:page", ns):
|
||||
title = page.find("mw:title", ns).text
|
||||
title_tag = page.find("mw:title", ns)
|
||||
if title_tag is None:
|
||||
print(f"Title tag not found in page: {ET.tostring(page)}", file=sys.stderr)
|
||||
continue
|
||||
title = title_tag.text
|
||||
if title is None:
|
||||
print(
|
||||
f"Title text doesn't exist in page: {ET.tostring(page)}",
|
||||
file=sys.stderr,
|
||||
)
|
||||
continue
|
||||
|
||||
if title.startswith("User:") or title.startswith("Talk:"):
|
||||
root.remove(page)
|
||||
@ -25,7 +52,7 @@ def process_dump(dump_file: str, out_file: str) -> None:
|
||||
|
||||
if len(revisions) > 1:
|
||||
latest_revision = max(
|
||||
revisions, key=lambda rev: rev.find("mw:timestamp", ns).text
|
||||
revisions, key=lambda revison: get_revision_timestamp(revison, ns)
|
||||
)
|
||||
|
||||
# Remove all revisions except the latest one
|
||||
@ -58,6 +85,11 @@ def dump_link_map(jsonfile: str, dumpfile: str) -> None:
|
||||
|
||||
for doc in root.findall("doc"):
|
||||
title = doc.attrib.get("title")
|
||||
if title is None:
|
||||
print(
|
||||
f"Title not found in doc: {ET.tostring(doc)}", file=sys.stderr
|
||||
)
|
||||
continue
|
||||
title = re.sub(r"\s+", "_", title)
|
||||
content = doc.text
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user