mirror of
https://github.com/Mic92/nixos-wiki-infra.git
synced 2025-02-16 10:21:44 +01:00
linkcheck: sort csv in python
This commit is contained in:
parent
2be4de6fc6
commit
7c9a68ff76
@ -81,13 +81,6 @@ timeout 30 lychee -E \
|
|||||||
# csv of status, url, corresponding wiki page link
|
# csv of status, url, corresponding wiki page link
|
||||||
python3 ../main.py dumplinkmap lychee.json failed-wiki-links.csv
|
python3 ../main.py dumplinkmap lychee.json failed-wiki-links.csv
|
||||||
|
|
||||||
# sort for consistency
|
|
||||||
{
|
|
||||||
head -n 1 failed-wiki-links.csv
|
|
||||||
tail -n +2 failed-wiki-links.csv | sort -t$'\t' -k2,2
|
|
||||||
} >sorted_filename.tsv
|
|
||||||
mv sorted_filename.tsv failed-wiki-links.csv
|
|
||||||
|
|
||||||
cat failed-wiki-links.csv
|
cat failed-wiki-links.csv
|
||||||
|
|
||||||
dest="../lychee-$(printf '%(%Y-%m-%d)T\n')-report"
|
dest="../lychee-$(printf '%(%Y-%m-%d)T\n')-report"
|
||||||
|
@ -3,6 +3,7 @@ import csv
|
|||||||
import re
|
import re
|
||||||
import sys
|
import sys
|
||||||
import argparse
|
import argparse
|
||||||
|
import bisect
|
||||||
import xml.etree.ElementTree as ET
|
import xml.etree.ElementTree as ET
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
@ -72,38 +73,43 @@ def badlinks_print(args: argparse.Namespace) -> None:
|
|||||||
of.write(f"--exclude {stripped_line} ")
|
of.write(f"--exclude {stripped_line} ")
|
||||||
|
|
||||||
|
|
||||||
|
def read_lychee_file(lychee_file: Path) -> list[list[str]]:
    """Return the failed links from a lychee JSON report, sorted.

    The report's ``fail_map`` maps the path of a scanned XML dump to the
    list of link entries that failed in it. Each dump is a sequence of
    ``<doc title="...">`` elements; every failed URL that occurs in a
    doc's text yields one row.

    Args:
        lychee_file: Path to the JSON report written by lychee.

    Returns:
        Rows of ``[status, url, wiki_page_url]`` in ascending order
        (compared element by element). ``status`` is the entry's
        ``status.code`` as stored in the JSON, or 403 when the entry
        carries no code.

    Raises:
        KeyError: If the report has no ``fail_map`` key or an entry has
            no ``url`` key.
    """
    fail_data = json.loads(lychee_file.read_text())
    failed_urls = []
    for xml_file, failed_url_entries in fail_data["fail_map"].items():
        with open(xml_file, "r", encoding="utf-8") as xmlf:
            # The dump has several top-level <doc> elements, so wrap it in
            # a synthetic root to make it well-formed XML.
            root = ET.fromstring(f"<root>{xmlf.read()}</root>")

        for doc in root.findall("doc"):
            title = doc.attrib.get("title")
            if title is None:
                print(f"Title not found in doc: {ET.tostring(doc)}", file=sys.stderr)
                continue
            title = re.sub(r"\s+", "_", title)
            # An element without text has ``text is None``; treat it as
            # empty so the substring test below cannot raise TypeError.
            content = doc.text or ""
            for entry in failed_url_entries:
                url = entry["url"]
                status = entry.get("status", {}).get("code", 403)
                if url in content:
                    failed_urls.append(
                        [
                            status,
                            url,
                            f"https://wiki.nixos.org/wiki/{title}",
                        ]
                    )
    # One sort at the end gives the same ordering as inserting each row in
    # sorted position, without shifting the list on every insert.
    failed_urls.sort()
    return failed_urls
|
||||||
|
|
||||||
|
|
||||||
def dump_link_map(args: argparse.Namespace) -> None:
    """Write the failed links of a lychee report to a tab-separated file.

    Reads the report at ``args.json_file`` through ``read_lychee_file`` and
    writes a ``STATUS``/``URL``/``WIKIURL`` header followed by one row per
    returned entry to ``args.dump_file``.
    """
    rows = read_lychee_file(args.json_file)
    header = ["STATUS", "URL", "WIKIURL"]

    # newline="" lets the csv module control line endings itself.
    with args.dump_file.open(mode="w", newline="", encoding="utf-8") as out:
        writer = csv.writer(out, delimiter="\t", quotechar='"')
        writer.writerow(header)
        writer.writerows(rows)
|
|
||||||
|
|
||||||
|
|
||||||
def main() -> None:
|
def main() -> None:
|
||||||
|
Loading…
Reference in New Issue
Block a user