#!/usr/bin/env python3 from __future__ import annotations import argparse import html import json import re import sys import unicodedata from html.parser import HTMLParser from pathlib import Path from urllib.parse import unquote, urlparse import requests API_URL = "https://vi.wikipedia.org/w/api.php" DEFAULT_OUTPUT_DIR = Path(__file__).resolve().parents[1] / "tmp" / "wiki" USER_AGENT = "UltimateHistoryMapWikiImporter/1.0" ALLOWED_TAGS = { "p", "blockquote", "h2", "h3", "h4", "h5", "h6", "ul", "ol", "li", "b", "strong", "i", "em", "code", "pre", "a", "br", } SKIP_TAGS = { "audio", "canvas", "figure", "form", "iframe", "img", "input", "map", "math", "meta", "noscript", "picture", "script", "style", "svg", "table", "video", } SKIP_CLASS_PARTS = ( "ambox", "authority-control", "catlinks", "error", "hatnote", "metadata", "mw-editsection", "mw-empty-elt", "navbox", "navigation-not-searchable", "noprint", "reference", "reflist", "shortdescription", "sidebar", "toc", "vertical-navbox", ) VOID_TAGS = {"br"} class WikiHtmlSanitizer(HTMLParser): def __init__(self) -> None: super().__init__(convert_charrefs=False) self.parts: list[str] = [] self.open_tags: list[str] = [] self.skip_depth = 0 def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None: tag = tag.lower() if self.skip_depth: self.skip_depth += 1 return attr_map = {name.lower(): value or "" for name, value in attrs} if tag in SKIP_TAGS or self._has_skipped_class(attr_map.get("class", "")): self.skip_depth = 1 return if tag not in ALLOWED_TAGS: return if tag == "a": self.parts.append('') elif tag == "br": self.parts.append("
") return else: self.parts.append(f"<{tag}>") self.open_tags.append(tag) def handle_startendtag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None: tag = tag.lower() if self.skip_depth: return attr_map = {name.lower(): value or "" for name, value in attrs} if tag in SKIP_TAGS or self._has_skipped_class(attr_map.get("class", "")): return if tag == "br": self.parts.append("
") def handle_endtag(self, tag: str) -> None: tag = tag.lower() if self.skip_depth: self.skip_depth -= 1 return if tag not in ALLOWED_TAGS or tag in VOID_TAGS: return for index in range(len(self.open_tags) - 1, -1, -1): if self.open_tags[index] == tag: while len(self.open_tags) > index: closing_tag = self.open_tags.pop() self.parts.append(f"") return def handle_data(self, data: str) -> None: if self.skip_depth: return if not data: return self.parts.append(html.escape(data, quote=False)) def handle_entityref(self, name: str) -> None: if self.skip_depth: return self.parts.append(f"&{name};") def handle_charref(self, name: str) -> None: if self.skip_depth: return self.parts.append(f"&#{name};") def get_html(self) -> str: while self.open_tags: self.parts.append(f"") return "".join(self.parts) @staticmethod def _has_skipped_class(class_value: str) -> bool: classes = class_value.lower().split() return any(any(part in cls for part in SKIP_CLASS_PARTS) for cls in classes) def title_from_source(source: str) -> str: parsed = urlparse(source) if parsed.scheme and parsed.netloc: if "/wiki/" in parsed.path: return unquote(parsed.path.rsplit("/wiki/", 1)[1]).replace("_", " ") raise ValueError(f"Unsupported Wikipedia URL: {source}") return source.replace("_", " ").strip() def slugify_title(title: str) -> str: text = unicodedata.normalize("NFD", title.strip().lower()) text = "".join(ch for ch in text if unicodedata.category(ch) != "Mn") text = text.replace("đ", "d") text = re.sub(r"[^a-z0-9]+", "-", text) return text.strip("-") or "wiki" def fetch_wikipedia_html(title: str) -> tuple[str, str]: response = requests.get( API_URL, params={ "action": "parse", "page": title, "prop": "text", "format": "json", "formatversion": "2", "redirects": "1", "disableeditsection": "1", }, headers={"User-Agent": USER_AGENT}, timeout=30, ) response.raise_for_status() payload = response.json() if "error" in payload: raise RuntimeError(json.dumps(payload["error"], ensure_ascii=False)) parsed = payload.get("parse") or {} fetched_title = str(parsed.get("title") or title).strip() article_html = str(parsed.get("text") or "") if not article_html.strip(): raise RuntimeError(f"No article HTML returned for title: {title}") return fetched_title, article_html def sanitize_wikipedia_html(article_html: str) -> str: parser = WikiHtmlSanitizer() parser.feed(article_html) parser.close() content = html.unescape(parser.get_html()) content = normalize_fragment(content) return content def normalize_fragment(content: str) -> str: content = re.sub(r"\r\n?", "\n", content) content = re.sub(r"[ \t\f\v]+", " ", content) content = re.sub(r"\s*\n\s*", "\n", content) content = re.sub(r">\s+<", "><", content) content = re.sub(r"<(p|li|h[2-6]|blockquote)>\s*", "", content) content = re.sub(r"<(ul|ol)>\s*", "", content) content = re.sub(r"()", r"\1\n", content) content = re.sub(r"\n{2,}", "\n", content) return content.strip() def put_first_paragraph_in_blockquote(content: str) -> str: match = re.search(r"

(.*?)

", content, flags=re.S) if not match: return content quote_inner = match.group(1).strip() before = content[: match.start()].strip() after = content[match.end() :].strip() parts = [] if quote_inner: parts.append(f"
{quote_inner}
") if before: parts.append(before) if after: parts.append(after) return "\n".join(parts).strip() def write_article(source: str, output_dir: Path, output_name: str | None = None) -> Path: title = title_from_source(source) fetched_title, article_html = fetch_wikipedia_html(title) content = sanitize_wikipedia_html(article_html) content = put_first_paragraph_in_blockquote(content) filename = output_name or f"{slugify_title(fetched_title)}.html" if not filename.endswith(".html"): filename = f"{filename}.html" output_dir.mkdir(parents=True, exist_ok=True) output_path = output_dir / filename output_path.write_text(content + "\n", encoding="utf-8") return output_path def main() -> int: parser = argparse.ArgumentParser(description="Fetch a Vietnamese Wikipedia article into UHM wiki HTML format.") parser.add_argument("source", help="Vietnamese Wikipedia URL or page title.") parser.add_argument("--output-dir", type=Path, default=DEFAULT_OUTPUT_DIR) parser.add_argument("--output-name", help="Output filename. Defaults to a slug from the fetched title.") args = parser.parse_args() output_path = write_article(args.source, args.output_dir, args.output_name) print(output_path) return 0 if __name__ == "__main__": try: raise SystemExit(main()) except Exception as exc: print(f"error: {exc}", file=sys.stderr) raise SystemExit(1)