linkcheck: add argparse

Jörg Thalheim 2024-09-17 10:07:55 +02:00
parent f66c272fc2
commit 7d16671ce2


@@ -2,8 +2,8 @@ import json
 import csv
 import re
 import sys
+import argparse
 import xml.etree.ElementTree as ET
-from typing import NoReturn


 def get_revision_timestamp(revision: ET.Element, ns: dict[str, str]) -> str:
@@ -24,8 +24,8 @@ def get_revision_timestamp(revision: ET.Element, ns: dict[str, str]) -> str:
 # filter out unimportant pages like Talk:, User:, and old revisions of posts
-def process_dump(dump_file: str, out_file: str) -> None:
-    tree = ET.parse(dump_file)
+def process_dump(args: argparse.Namespace) -> None:
+    tree = ET.parse(args.dump_file)
     root = tree.getroot()
     ns = {"mw": "http://www.mediawiki.org/xml/export-0.11/"}
@@ -60,22 +60,23 @@ def process_dump(dump_file: str, out_file: str) -> None:
             if revision != latest_revision:
                 page.remove(revision)

-    tree.write(out_file, encoding="utf-8", xml_declaration=False)
+    tree.write(args.out_file, encoding="utf-8", xml_declaration=False)


-def badlinks_print(known_file: str, outfile: str) -> None:
-    with open(known_file, "r") as infile, open(outfile, "w") as of:
+def badlinks_print(args: argparse.Namespace) -> None:
+    # known_file: str, outfile: str) -> None:
+    with open(args.known_file, "r") as infile, open(args.outfile, "w") as of:
         for line in infile:
             stripped_line = line.strip()
             if stripped_line and not stripped_line.startswith("#"):
                 of.write(f"--exclude {stripped_line} ")


-def dump_link_map(jsonfile: str, dumpfile: str) -> None:
-    with open(jsonfile, "r") as json_file:
+def dump_link_map(args: argparse.Namespace) -> None:
+    with open(args.jsonfile, "r") as json_file:
         fail_data = json.load(json_file)

-    with open(dumpfile, mode="w", newline="", encoding="utf-8") as csv_file:
+    with open(args.dumpfile, mode="w", newline="", encoding="utf-8") as csv_file:
         csv_writer = csv.writer(csv_file, delimiter="\t", quotechar='"')
         csv_writer.writerow(["STATUS", "URL", "WIKIURL"])
@@ -106,44 +107,31 @@ def dump_link_map(jsonfile: str, dumpfile: str) -> None:
         )


-def print_usage(status: int = 0) -> NoReturn:
-    print(
-        """
-Usage: python main.py [action] <inputfile> <outfile>
-    [action]      <inputfile>     <outfile>        what?
-    filter        dumpxmlfile     outxmlfile       filter out unncesscary pages from dump
-    badlinks      badlinksfile    outfile          parse and print known allowed.links
-    dumplinkmap   jsonfile        outfilelinkmap   dumps a map of url and nixos article where it is present
-    help          prints this help message and exits
-"""
-    )
-    sys.exit(status)
-
-
 def main() -> None:
-    if len(sys.argv) < 2:
-        print_usage(1)
-    action = sys.argv[1]
-    if action in "filter|badlinks|dumplinkmap":
-        if len(sys.argv) != 4:
-            print_usage(1)
-        if action == "filter":
-            dump_file = sys.argv[2]
-            out_file = sys.argv[3]
-            process_dump(dump_file, out_file)
-        elif action == "badlinks":
-            known_file = sys.argv[2]
-            out_file = sys.argv[3]
-            badlinks_print(known_file, out_file)
-        elif action == "dumplinkmap":
-            jsonfile = sys.argv[2]
-            dumpfile = sys.argv[3]
-            dump_link_map(jsonfile, dumpfile)
-    elif action in "--help":
-        print_usage(0)
-    else:
-        print_usage(1)
+    parser = argparse.ArgumentParser(description="Process wiki dump files")
+    subparsers = parser.add_subparsers()
+
+    parser_filter = subparsers.add_parser("filter", help="Filter out unimportant pages")
+    parser_filter.add_argument("dump_file", type=str)
+    parser_filter.add_argument("out_file", type=str)
+    parser_filter.set_defaults(func=process_dump)
+
+    parser_badlinks = subparsers.add_parser(
+        "badlinks", help="Parse and print known allowed links"
+    )
+    parser_badlinks.add_argument("known_file", type=str)
+    parser_badlinks.add_argument("out_file", type=str)
+    parser_badlinks.set_defaults(func=badlinks_print)
+
+    parser_dumplinkmap = subparsers.add_parser(
+        "dumplinkmap", help="Dump a map of url and nixos article where it is present"
+    )
+    parser_dumplinkmap.add_argument("jsonfile", type=str)
+    parser_dumplinkmap.add_argument("dumpfile", type=str)
+    parser_dumplinkmap.set_defaults(func=dump_link_map)
+
+    args = parser.parse_args()
+    args.func(args)


 if __name__ == "__main__":
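
Note: the rewritten main() relies on the standard argparse subcommand dispatch idiom: each subparser registers its handler via set_defaults(func=...), and after parse_args() the chosen handler is invoked as args.func(args). The following is a minimal, self-contained sketch of that idiom, not code from this commit; the handler name filter_cmd, the dest/required arguments, and the example file names are illustrative assumptions.

import argparse


def filter_cmd(args: argparse.Namespace) -> None:
    # Hypothetical stand-in for process_dump(); only echoes the parsed arguments.
    print(f"would filter {args.dump_file} into {args.out_file}")


def main() -> None:
    parser = argparse.ArgumentParser(description="subcommand dispatch sketch")
    # dest/required make a subcommand mandatory (assumption; the commit calls
    # add_subparsers() without them), replacing the old print_usage(1) path.
    subparsers = parser.add_subparsers(dest="command", required=True)

    parser_filter = subparsers.add_parser("filter", help="filter a wiki dump")
    parser_filter.add_argument("dump_file", type=str)
    parser_filter.add_argument("out_file", type=str)
    parser_filter.set_defaults(func=filter_cmd)

    args = parser.parse_args()
    args.func(args)  # dispatch to the handler registered for the chosen subcommand


if __name__ == "__main__":
    # invoked e.g. as: python sketch.py filter dump.xml out.xml
    main()

With argparse, --help and per-subcommand help are generated automatically, which is what makes the hand-written print_usage() removable.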