diff --git a/checks/linkcheck/main.py b/checks/linkcheck/main.py
index a7227c6..c949bce 100644
--- a/checks/linkcheck/main.py
+++ b/checks/linkcheck/main.py
@@ -2,8 +2,8 @@
 import json
 import csv
 import re
 import sys
+import argparse
 import xml.etree.ElementTree as ET
-from typing import NoReturn
 
 def get_revision_timestamp(revision: ET.Element, ns: dict[str, str]) -> str:
@@ -24,8 +24,8 @@ def get_revision_timestamp(revision: ET.Element, ns: dict[str, str]) -> str:
 
 
 # filter out unimportant pages like Talk:, User:, and old revisions of posts
-def process_dump(dump_file: str, out_file: str) -> None:
-    tree = ET.parse(dump_file)
+def process_dump(args: argparse.Namespace) -> None:
+    tree = ET.parse(args.dump_file)
     root = tree.getroot()
 
     ns = {"mw": "http://www.mediawiki.org/xml/export-0.11/"}
@@ -60,22 +60,22 @@ def process_dump(dump_file: str, out_file: str) -> None:
             if revision != latest_revision:
                 page.remove(revision)
 
-    tree.write(out_file, encoding="utf-8", xml_declaration=False)
+    tree.write(args.out_file, encoding="utf-8", xml_declaration=False)
 
 
-def badlinks_print(known_file: str, outfile: str) -> None:
-    with open(known_file, "r") as infile, open(outfile, "w") as of:
+def badlinks_print(args: argparse.Namespace) -> None:
+    with open(args.known_file, "r") as infile, open(args.out_file, "w") as of:
         for line in infile:
             stripped_line = line.strip()
             if stripped_line and not stripped_line.startswith("#"):
                 of.write(f"--exclude {stripped_line} ")
 
 
-def dump_link_map(jsonfile: str, dumpfile: str) -> None:
-    with open(jsonfile, "r") as json_file:
+def dump_link_map(args: argparse.Namespace) -> None:
+    with open(args.jsonfile, "r") as json_file:
         fail_data = json.load(json_file)
 
-    with open(dumpfile, mode="w", newline="", encoding="utf-8") as csv_file:
+    with open(args.dumpfile, mode="w", newline="", encoding="utf-8") as csv_file:
         csv_writer = csv.writer(csv_file, delimiter="\t", quotechar='"')
         csv_writer.writerow(["STATUS",
                             "URL", "WIKIURL"])
@@ -106,44 +106,32 @@ def dump_link_map(jsonfile: str, dumpfile: str) -> None:
         )
 
 
-def print_usage(status: int = 0) -> NoReturn:
-    print(
-        """
-Usage: python main.py [action]
-    [action]                what?
-    ——————————————————————————————————————————————————————————
-    filter dumpxmlfile outxmlfile          filter out unncesscary pages from dump
-    badlinks badlinksfile outfile          parse and print known allowed.links
-    dumplinkmap jsonfile outfilelinkmap    dumps a map of url and nixos article where it is present
-    help                                   prints this help message and exits
-"""
-    )
-    sys.exit(status)
-
-
 def main() -> None:
-    if len(sys.argv) < 2:
-        print_usage(1)
-    action = sys.argv[1]
-    if action in "filter|badlinks|dumplinkmap":
-        if len(sys.argv) != 4:
-            print_usage(1)
-        if action == "filter":
-            dump_file = sys.argv[2]
-            out_file = sys.argv[3]
-            process_dump(dump_file, out_file)
-        elif action == "badlinks":
-            known_file = sys.argv[2]
-            out_file = sys.argv[3]
-            badlinks_print(known_file, out_file)
-        elif action == "dumplinkmap":
-            jsonfile = sys.argv[2]
-            dumpfile = sys.argv[3]
-            dump_link_map(jsonfile, dumpfile)
-    elif action in "--help":
-        print_usage(0)
-    else:
-        print_usage(1)
+    parser = argparse.ArgumentParser(description="Process wiki dump files")
+    # Require a subcommand so args.func is always set before the call below;
+    # bare `python main.py` then exits with a usage error instead of AttributeError.
+    subparsers = parser.add_subparsers(dest="command", required=True)
+    parser_filter = subparsers.add_parser("filter", help="Filter out unimportant pages")
+
+    parser_filter.add_argument("dump_file", type=str)
+    parser_filter.add_argument("out_file", type=str)
+    parser_filter.set_defaults(func=process_dump)
+
+    parser_badlinks = subparsers.add_parser(
+        "badlinks", help="Parse and print known allowed links"
+    )
+    parser_badlinks.add_argument("known_file", type=str)
+    parser_badlinks.add_argument("out_file", type=str)
+    parser_badlinks.set_defaults(func=badlinks_print)
+
+    parser_dumplinkmap = subparsers.add_parser(
+        "dumplinkmap", help="Dump a map of url and nixos article where it is present"
+    )
+    parser_dumplinkmap.add_argument("jsonfile", type=str)
+    parser_dumplinkmap.add_argument("dumpfile", type=str)
+    parser_dumplinkmap.set_defaults(func=dump_link_map)
+
+    args = parser.parse_args()
+    args.func(args)
 
 
 if __name__ == "__main__":