285 lines
7.9 KiB
Python
Executable File
285 lines
7.9 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
from __future__ import annotations
|
|
|
|
import argparse
|
|
import html
|
|
import json
|
|
import re
|
|
import sys
|
|
import unicodedata
|
|
from html.parser import HTMLParser
|
|
from pathlib import Path
|
|
from urllib.parse import unquote, urlparse
|
|
|
|
import requests
|
|
|
|
|
|
# Endpoint of the Vietnamese Wikipedia MediaWiki API used for every request.
API_URL = "https://vi.wikipedia.org/w/api.php"

# Default destination for generated files: "<two levels above this file>/tmp/wiki".
DEFAULT_OUTPUT_DIR = Path(__file__).resolve().parents[1].joinpath("tmp", "wiki")

# Value sent in the User-Agent header of outgoing API requests.
USER_AGENT = "UltimateHistoryMapWikiImporter/1.0"
|
|
|
|
# Tags the sanitizer re-emits (without their original attributes).
ALLOWED_TAGS = {
    # block-level text
    "p", "blockquote", "pre",
    # headings
    "h2", "h3", "h4", "h5", "h6",
    # lists
    "ul", "ol", "li",
    # inline formatting
    "b", "strong", "i", "em", "code",
    # links and line breaks
    "a", "br",
}

# Tags whose whole subtree is dropped by the sanitizer.
SKIP_TAGS = {
    "audio", "canvas", "figure", "form", "iframe", "img",
    "input", "map", "math", "meta", "noscript", "picture",
    "script", "style", "svg", "table", "video",
}

# Substrings looked for inside each CSS class name; an element carrying a
# class that contains any of them is dropped together with its subtree.
SKIP_CLASS_PARTS = (
    "ambox",
    "authority-control",
    "catlinks",
    "error",
    "hatnote",
    "metadata",
    "mw-editsection",
    "mw-empty-elt",
    "navbox",
    "navigation-not-searchable",
    "noprint",
    "reference",
    "reflist",
    "shortdescription",
    "sidebar",
    "toc",
    "vertical-navbox",
)

# Allowed tags that never receive a closing tag in the output.
VOID_TAGS = {"br"}
|
|
|
|
|
|
class WikiHtmlSanitizer(HTMLParser):
    """Reduce article HTML to the tags listed in ``ALLOWED_TAGS``.

    Feed markup with ``feed()``/``close()`` and read the result with
    ``get_html()``. While parsing:

    * elements in ``SKIP_TAGS`` or carrying a class matched by
      ``SKIP_CLASS_PARTS`` are dropped together with everything inside them;
    * tags outside ``ALLOWED_TAGS`` are dropped but their content is kept;
    * allowed tags are re-emitted without attributes, except ``<a>``, which
      is always written as ``<a href="__missing__">``;
    * text is HTML-escaped and character/entity references are passed
      through verbatim (the parser runs with ``convert_charrefs=False``).
    """

    # HTML void elements never have an end tag, so they must not take part in
    # skip-depth bookkeeping: counting ``<img>`` or ``<br>`` (written without
    # a trailing slash) as an opened element would leave the depth above zero
    # and silently discard the remainder of the document.
    _VOID_ELEMENTS = frozenset(
        {
            "area", "base", "br", "col", "embed", "hr", "img",
            "input", "link", "meta", "param", "source", "track", "wbr",
        }
    )

    def __init__(self) -> None:
        super().__init__(convert_charrefs=False)
        # Output fragments, joined by get_html().
        self.parts: list[str] = []
        # Stack of emitted tags that still need a closing tag.
        self.open_tags: list[str] = []
        # Nesting depth inside a skipped element; 0 means "not skipping".
        self.skip_depth = 0

    def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
        """Emit, ignore, or start skipping at an opening tag."""
        tag = tag.lower()
        is_void = tag in self._VOID_ELEMENTS

        if self.skip_depth:
            # Only elements that will later be closed deepen the skipped region.
            if not is_void:
                self.skip_depth += 1
            return

        attr_map = {name.lower(): value or "" for name, value in attrs}
        if tag in SKIP_TAGS or self._has_skipped_class(attr_map.get("class", "")):
            # A void element has no subtree to skip; dropping the tag is enough.
            if not is_void:
                self.skip_depth = 1
            return

        if tag not in ALLOWED_TAGS:
            return

        if tag == "br":
            self.parts.append("<br>")
            return

        if tag == "a":
            # Original link targets are discarded; a placeholder href is written.
            self.parts.append('<a href="__missing__">')
        else:
            self.parts.append(f"<{tag}>")
        self.open_tags.append(tag)

    def handle_startendtag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
        """Handle ``<tag/>``: only a non-skipped ``<br/>`` produces output."""
        tag = tag.lower()
        if self.skip_depth:
            return
        attr_map = {name.lower(): value or "" for name, value in attrs}
        if tag in SKIP_TAGS or self._has_skipped_class(attr_map.get("class", "")):
            return
        if tag == "br":
            self.parts.append("<br>")

    def handle_endtag(self, tag: str) -> None:
        """Close the matching open tag, or unwind one level of skipping."""
        tag = tag.lower()
        if self.skip_depth:
            # Mirror handle_starttag: void elements were never counted.
            if tag not in self._VOID_ELEMENTS:
                self.skip_depth -= 1
            return
        if tag not in ALLOWED_TAGS or tag in VOID_TAGS:
            return

        # Find the innermost open occurrence of this tag and close everything
        # opened after it as well, so the output stays properly nested. An end
        # tag with no matching open tag is ignored.
        for index in range(len(self.open_tags) - 1, -1, -1):
            if self.open_tags[index] == tag:
                while len(self.open_tags) > index:
                    closing_tag = self.open_tags.pop()
                    self.parts.append(f"</{closing_tag}>")
                return

    def handle_data(self, data: str) -> None:
        """Append escaped text unless inside a skipped element."""
        if self.skip_depth or not data:
            return
        self.parts.append(html.escape(data, quote=False))

    def handle_entityref(self, name: str) -> None:
        """Pass a named entity reference through unchanged."""
        if self.skip_depth:
            return
        self.parts.append(f"&{name};")

    def handle_charref(self, name: str) -> None:
        """Pass a numeric character reference through unchanged."""
        if self.skip_depth:
            return
        self.parts.append(f"&#{name};")

    def get_html(self) -> str:
        """Close any tags still open and return the accumulated markup."""
        while self.open_tags:
            self.parts.append(f"</{self.open_tags.pop()}>")
        return "".join(self.parts)

    @staticmethod
    def _has_skipped_class(class_value: str) -> bool:
        """Return True if any class name contains a ``SKIP_CLASS_PARTS`` entry.

        Matching is a case-insensitive substring test on each
        whitespace-separated class name.
        """
        classes = class_value.lower().split()
        return any(any(part in cls for part in SKIP_CLASS_PARTS) for cls in classes)
|
|
|
|
|
|
def title_from_source(source: str) -> str:
    """Turn a Wikipedia article URL or a bare page title into a page title.

    Args:
        source: Either an absolute URL whose path contains ``/wiki/<title>``
            or a page title (underscores are accepted in place of spaces).

    Returns:
        The title with percent-escapes decoded (URL form only), underscores
        replaced by spaces, and surrounding whitespace removed.

    Raises:
        ValueError: If ``source`` is an absolute URL that has no ``/wiki/``
            segment or has nothing after it.
    """
    candidate = source.strip()
    parsed = urlparse(candidate)
    if not (parsed.scheme and parsed.netloc):
        # Not an absolute URL: treat the input as the title itself.
        return candidate.replace("_", " ").strip()

    # Split on the FIRST "/wiki/" so titles that themselves contain "/wiki/"
    # (e.g. subpages) are kept whole instead of being cut at the last one.
    _, marker, raw_title = parsed.path.partition("/wiki/")
    title = unquote(raw_title).replace("_", " ").strip()
    if not marker or not title:
        raise ValueError(f"Unsupported Wikipedia URL: {source}")
    return title
|
|
|
|
|
|
def slugify_title(title: str) -> str:
    """Build a lowercase ASCII slug from a (Vietnamese) page title.

    The title is trimmed, lower-cased and NFD-decomposed; combining marks
    (Unicode category ``Mn``) are removed, ``đ`` becomes ``d``, and every run
    of characters outside ``a-z0-9`` becomes a single hyphen. Leading and
    trailing hyphens are stripped. If nothing remains, ``"wiki"`` is returned.
    """
    decomposed = unicodedata.normalize("NFD", title.strip().lower())
    base_chars = [
        char for char in decomposed if unicodedata.category(char) != "Mn"
    ]
    ascii_like = "".join(base_chars).replace("đ", "d")
    slug = re.sub(r"[^a-z0-9]+", "-", ascii_like).strip("-")
    if slug:
        return slug
    return "wiki"
|
|
|
|
|
|
def fetch_wikipedia_html(title: str) -> tuple[str, str]:
    """Request the rendered HTML of a page from the API at ``API_URL``.

    Sends an ``action=parse`` request asking for the ``text`` prop, with
    redirects enabled and edit-section links disabled.

    Args:
        title: Page title to request.

    Returns:
        ``(fetched_title, article_html)``. ``fetched_title`` is the ``title``
        field of the response when present, otherwise the requested title.

    Raises:
        requests.HTTPError: If the HTTP status signals an error
            (via ``raise_for_status``).
        RuntimeError: If the payload contains an ``error`` entry (serialized
            as JSON in the message) or carries no non-blank article HTML.
    """
    query = {
        "action": "parse",
        "page": title,
        "prop": "text",
        "format": "json",
        "formatversion": "2",
        "redirects": "1",
        "disableeditsection": "1",
    }
    response = requests.get(
        API_URL,
        params=query,
        headers={"User-Agent": USER_AGENT},
        timeout=30,
    )
    response.raise_for_status()

    payload = response.json()
    if "error" in payload:
        raise RuntimeError(json.dumps(payload["error"], ensure_ascii=False))

    page = payload.get("parse") or {}
    fetched_title = str(page.get("title") or title).strip()
    article_html = str(page.get("text") or "")
    if article_html.strip():
        return fetched_title, article_html
    raise RuntimeError(f"No article HTML returned for title: {title}")
|
|
|
|
|
|
# One well-formed character reference: decimal, hexadecimal, or named.
_CHARACTER_REFERENCE_RE = re.compile(
    r"&(?:#[0-9]+|#[xX][0-9a-fA-F]+|[A-Za-z][A-Za-z0-9]*);"
)


def _decode_non_markup_references(markup: str) -> str:
    """Decode character references except those standing for ``&``, ``<``, ``>``."""

    def _decode(match: re.Match) -> str:
        decoded = html.unescape(match.group(0))
        if decoded in ("&", "<", ">"):
            # Keep markup-significant characters escaped so article text
            # cannot turn into tags or new references.
            return html.escape(decoded, quote=False)
        return decoded

    return _CHARACTER_REFERENCE_RE.sub(_decode, markup)


def sanitize_wikipedia_html(article_html: str) -> str:
    """Sanitize article HTML and normalize the resulting fragment.

    The markup is run through ``WikiHtmlSanitizer``. Character references in
    its output are then decoded to literal characters, except references to
    ``&``, ``<`` and ``>``, which stay escaped. Finally the fragment is passed
    through ``normalize_fragment``.

    Decoding everything (a plain ``html.unescape`` over the whole fragment)
    would turn escaped text such as ``&lt;script&gt;`` back into live markup
    after sanitization, so those three characters are deliberately excluded.

    Args:
        article_html: Raw article HTML.

    Returns:
        The sanitized, normalized HTML fragment.
    """
    parser = WikiHtmlSanitizer()
    parser.feed(article_html)
    parser.close()
    content = _decode_non_markup_references(parser.get_html())
    return normalize_fragment(content)
|
|
|
|
|
|
# Block-level elements that are removed when they contain only whitespace.
_EMPTY_BLOCK_RE = re.compile(r"<(p|li|h[2-6]|blockquote|ul|ol)>\s*</\1>")


def normalize_fragment(content: str) -> str:
    """Normalize whitespace and drop empty block elements in an HTML fragment.

    Steps, in order:

    1. Convert ``\\r\\n`` / ``\\r`` to ``\\n``.
    2. Collapse runs of spaces, tabs, form feeds and vertical tabs to one space.
    3. Remove whitespace around newlines and between adjacent tags.
    4. Remove empty ``p``, ``li``, ``h2``-``h6``, ``blockquote``, ``ul`` and
       ``ol`` elements, repeating until none remain so that containers that
       only held empty elements (e.g. ``<blockquote><p></p></blockquote>``)
       disappear as well.
    5. Put exactly one newline after each closing block tag and strip the ends.

    NOTE: the whitespace steps run over the whole string, so whitespace inside
    ``<pre>`` is collapsed too.

    Args:
        content: HTML fragment with attribute-free tags.

    Returns:
        The normalized fragment, or an empty string if nothing is left.
    """
    content = re.sub(r"\r\n?", "\n", content)
    content = re.sub(r"[ \t\f\v]+", " ", content)
    content = re.sub(r"\s*\n\s*", "\n", content)
    content = re.sub(r">\s+<", "><", content)

    # Removing an empty child can leave its parent empty, so prune to a fixed
    # point. Each pass that changes anything shortens the string, so this ends.
    while True:
        pruned = _EMPTY_BLOCK_RE.sub("", content)
        if pruned == content:
            break
        content = pruned

    content = re.sub(r"(</(?:p|h[2-6]|ul|ol|li|blockquote|pre)>)", r"\1\n", content)
    content = re.sub(r"\n{2,}", "\n", content)
    return content.strip()
|
|
|
|
|
|
def put_first_paragraph_in_blockquote(content: str) -> str:
    """Move the first ``<p>...</p>`` to the top, wrapped in ``<blockquote>``.

    The first paragraph (non-greedy match, may span lines) is removed from its
    position. Its trimmed inner markup, if not empty, is placed first as a
    ``<blockquote>``. The text before and after the paragraph follows, each
    trimmed, and the non-empty pieces are joined with newlines. Content
    without any ``<p>`` element is returned unchanged.
    """
    first_paragraph = re.search(r"<p>(.*?)</p>", content, flags=re.S)
    if first_paragraph is None:
        return content

    lead_text = first_paragraph.group(1).strip()
    lead_block = f"<blockquote>{lead_text}</blockquote>" if lead_text else ""
    preceding = content[: first_paragraph.start()].strip()
    following = content[first_paragraph.end() :].strip()

    pieces = [piece for piece in (lead_block, preceding, following) if piece]
    return "\n".join(pieces).strip()
|
|
|
|
|
|
def write_article(source: str, output_dir: Path, output_name: str | None = None) -> Path:
    """Fetch an article, convert it, and write it as an ``.html`` file.

    Args:
        source: Article URL or page title, resolved by ``title_from_source``.
        output_dir: Directory to write into; created with parents if missing.
        output_name: Optional filename. When omitted or empty, the slug of the
            fetched title is used. ``.html`` is appended if not already there.

    Returns:
        Path of the written file. The file holds the converted content
        followed by a newline, encoded as UTF-8.
    """
    requested_title = title_from_source(source)
    fetched_title, article_html = fetch_wikipedia_html(requested_title)
    content = put_first_paragraph_in_blockquote(sanitize_wikipedia_html(article_html))

    if output_name:
        filename = output_name
    else:
        filename = f"{slugify_title(fetched_title)}.html"
    if not filename.endswith(".html"):
        filename += ".html"

    output_dir.mkdir(parents=True, exist_ok=True)
    output_path = output_dir / filename
    output_path.write_text(f"{content}\n", encoding="utf-8")
    return output_path
|
|
|
|
|
|
def main() -> int:
    """Parse command-line arguments, write the article, and print its path.

    Returns:
        ``0`` after the output path has been printed.
    """
    cli = argparse.ArgumentParser(
        description="Fetch a Vietnamese Wikipedia article into UHM wiki HTML format."
    )
    cli.add_argument("source", help="Vietnamese Wikipedia URL or page title.")
    cli.add_argument("--output-dir", type=Path, default=DEFAULT_OUTPUT_DIR)
    cli.add_argument(
        "--output-name",
        help="Output filename. Defaults to a slug from the fetched title.",
    )
    options = cli.parse_args()

    written_path = write_article(options.source, options.output_dir, options.output_name)
    print(written_path)
    return 0
|
|
|
|
|
|
if __name__ == "__main__":
    # Script entry point: exit with main()'s return value, or report any
    # Exception on stderr and exit with status 1.
    try:
        exit_code = main()
    except Exception as exc:
        print(f"error: {exc}", file=sys.stderr)
        exit_code = 1
    raise SystemExit(exit_code)
|