basic Uer UI

2026-06-06 15:36:15 +07:00
parent 61949e7149
commit 820e0b5216
65 changed files with 3460 additions and 1322 deletions
@@ -0,0 +1,284 @@
+#!/usr/bin/env python3
+from __future__ import annotations
+
+import argparse
+import html
+import json
+import re
+import sys
+import unicodedata
+from html.parser import HTMLParser
+from pathlib import Path
+from urllib.parse import unquote, urlparse
+
+import requests
+
+
+API_URL = "https://vi.wikipedia.org/w/api.php"
+DEFAULT_OUTPUT_DIR = Path(__file__).resolve().parents[1] / "tmp" / "wiki"
+USER_AGENT = "UltimateHistoryMapWikiImporter/1.0"
+
+ALLOWED_TAGS = {
+    "p",
+    "blockquote",
+    "h2",
+    "h3",
+    "h4",
+    "h5",
+    "h6",
+    "ul",
+    "ol",
+    "li",
+    "b",
+    "strong",
+    "i",
+    "em",
+    "code",
+    "pre",
+    "a",
+    "br",
+}
+
+SKIP_TAGS = {
+    "audio",
+    "canvas",
+    "figure",
+    "form",
+    "iframe",
+    "img",
+    "input",
+    "map",
+    "math",
+    "meta",
+    "noscript",
+    "picture",
+    "script",
+    "style",
+    "svg",
+    "table",
+    "video",
+}
+
+SKIP_CLASS_PARTS = (
+    "ambox",
+    "authority-control",
+    "catlinks",
+    "error",
+    "hatnote",
+    "metadata",
+    "mw-editsection",
+    "mw-empty-elt",
+    "navbox",
+    "navigation-not-searchable",
+    "noprint",
+    "reference",
+    "reflist",
+    "shortdescription",
+    "sidebar",
+    "toc",
+    "vertical-navbox",
+)
+
+VOID_TAGS = {"br"}
+
+
+class WikiHtmlSanitizer(HTMLParser):
+    def __init__(self) -> None:
+        super().__init__(convert_charrefs=False)
+        self.parts: list[str] = []
+        self.open_tags: list[str] = []
+        self.skip_depth = 0
+
+    def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
+        tag = tag.lower()
+        if self.skip_depth:
+            self.skip_depth += 1
+            return
+
+        attr_map = {name.lower(): value or "" for name, value in attrs}
+        if tag in SKIP_TAGS or self._has_skipped_class(attr_map.get("class", "")):
+            self.skip_depth = 1
+            return
+
+        if tag not in ALLOWED_TAGS:
+            return
+
+        if tag == "a":
+            self.parts.append('<a href="__missing__">')
+        elif tag == "br":
+            self.parts.append("<br>")
+            return
+        else:
+            self.parts.append(f"<{tag}>")
+        self.open_tags.append(tag)
+
+    def handle_startendtag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
+        tag = tag.lower()
+        if self.skip_depth:
+            return
+        attr_map = {name.lower(): value or "" for name, value in attrs}
+        if tag in SKIP_TAGS or self._has_skipped_class(attr_map.get("class", "")):
+            return
+        if tag == "br":
+            self.parts.append("<br>")
+
+    def handle_endtag(self, tag: str) -> None:
+        tag = tag.lower()
+        if self.skip_depth:
+            self.skip_depth -= 1
+            return
+        if tag not in ALLOWED_TAGS or tag in VOID_TAGS:
+            return
+
+        for index in range(len(self.open_tags) - 1, -1, -1):
+            if self.open_tags[index] == tag:
+                while len(self.open_tags) > index:
+                    closing_tag = self.open_tags.pop()
+                    self.parts.append(f"</{closing_tag}>")
+                return
+
+    def handle_data(self, data: str) -> None:
+        if self.skip_depth:
+            return
+        if not data:
+            return
+        self.parts.append(html.escape(data, quote=False))
+
+    def handle_entityref(self, name: str) -> None:
+        if self.skip_depth:
+            return
+        self.parts.append(f"&{name};")
+
+    def handle_charref(self, name: str) -> None:
+        if self.skip_depth:
+            return
+        self.parts.append(f"&#{name};")
+
+    def get_html(self) -> str:
+        while self.open_tags:
+            self.parts.append(f"</{self.open_tags.pop()}>")
+        return "".join(self.parts)
+
+    @staticmethod
+    def _has_skipped_class(class_value: str) -> bool:
+        classes = class_value.lower().split()
+        return any(any(part in cls for part in SKIP_CLASS_PARTS) for cls in classes)
+
+
+def title_from_source(source: str) -> str:
+    parsed = urlparse(source)
+    if parsed.scheme and parsed.netloc:
+        if "/wiki/" in parsed.path:
+            return unquote(parsed.path.rsplit("/wiki/", 1)[1]).replace("_", " ")
+        raise ValueError(f"Unsupported Wikipedia URL: {source}")
+    return source.replace("_", " ").strip()
+
+
+def slugify_title(title: str) -> str:
+    text = unicodedata.normalize("NFD", title.strip().lower())
+    text = "".join(ch for ch in text if unicodedata.category(ch) != "Mn")
+    text = text.replace("đ", "d")
+    text = re.sub(r"[^a-z0-9]+", "-", text)
+    return text.strip("-") or "wiki"
+
+
+def fetch_wikipedia_html(title: str) -> tuple[str, str]:
+    response = requests.get(
+        API_URL,
+        params={
+            "action": "parse",
+            "page": title,
+            "prop": "text",
+            "format": "json",
+            "formatversion": "2",
+            "redirects": "1",
+            "disableeditsection": "1",
+        },
+        headers={"User-Agent": USER_AGENT},
+        timeout=30,
+    )
+    response.raise_for_status()
+    payload = response.json()
+    if "error" in payload:
+        raise RuntimeError(json.dumps(payload["error"], ensure_ascii=False))
+    parsed = payload.get("parse") or {}
+    fetched_title = str(parsed.get("title") or title).strip()
+    article_html = str(parsed.get("text") or "")
+    if not article_html.strip():
+        raise RuntimeError(f"No article HTML returned for title: {title}")
+    return fetched_title, article_html
+
+
+def sanitize_wikipedia_html(article_html: str) -> str:
+    parser = WikiHtmlSanitizer()
+    parser.feed(article_html)
+    parser.close()
+    content = html.unescape(parser.get_html())
+    content = normalize_fragment(content)
+    return content
+
+
+def normalize_fragment(content: str) -> str:
+    content = re.sub(r"\r\n?", "\n", content)
+    content = re.sub(r"[ \t\f\v]+", " ", content)
+    content = re.sub(r"\s*\n\s*", "\n", content)
+    content = re.sub(r">\s+<", "><", content)
+    content = re.sub(r"<(p|li|h[2-6]|blockquote)>\s*</\1>", "", content)
+    content = re.sub(r"<(ul|ol)>\s*</\1>", "", content)
+    content = re.sub(r"(</(?:p|h[2-6]|ul|ol|li|blockquote|pre)>)", r"\1\n", content)
+    content = re.sub(r"\n{2,}", "\n", content)
+    return content.strip()
+
+
+def put_first_paragraph_in_blockquote(content: str) -> str:
+    match = re.search(r"<p>(.*?)</p>", content, flags=re.S)
+    if not match:
+        return content
+
+    quote_inner = match.group(1).strip()
+    before = content[: match.start()].strip()
+    after = content[match.end() :].strip()
+    parts = []
+    if quote_inner:
+        parts.append(f"<blockquote>{quote_inner}</blockquote>")
+    if before:
+        parts.append(before)
+    if after:
+        parts.append(after)
+    return "\n".join(parts).strip()
+
+
+def write_article(source: str, output_dir: Path, output_name: str | None = None) -> Path:
+    title = title_from_source(source)
+    fetched_title, article_html = fetch_wikipedia_html(title)
+    content = sanitize_wikipedia_html(article_html)
+    content = put_first_paragraph_in_blockquote(content)
+
+    filename = output_name or f"{slugify_title(fetched_title)}.html"
+    if not filename.endswith(".html"):
+        filename = f"{filename}.html"
+
+    output_dir.mkdir(parents=True, exist_ok=True)
+    output_path = output_dir / filename
+    output_path.write_text(content + "\n", encoding="utf-8")
+    return output_path
+
+
+def main() -> int:
+    parser = argparse.ArgumentParser(description="Fetch a Vietnamese Wikipedia article into UHM wiki HTML format.")
+    parser.add_argument("source", help="Vietnamese Wikipedia URL or page title.")
+    parser.add_argument("--output-dir", type=Path, default=DEFAULT_OUTPUT_DIR)
+    parser.add_argument("--output-name", help="Output filename. Defaults to a slug from the fetched title.")
+    args = parser.parse_args()
+
+    output_path = write_article(args.source, args.output_dir, args.output_name)
+    print(output_path)
+    return 0
+
+
+if __name__ == "__main__":
+    try:
+        raise SystemExit(main())
+    except Exception as exc:
+        print(f"error: {exc}", file=sys.stderr)
+        raise SystemExit(1)