Files
History-user/scripts/fetch_vi_wiki.py
T
taDuc 820e0b5216
Build and Release / release (push) Successful in 1m30s
basic Uer UI
2026-06-06 15:36:15 +07:00

285 lines
7.9 KiB
Python
Executable File

#!/usr/bin/env python3
from __future__ import annotations
import argparse
import html
import json
import re
import sys
import unicodedata
from html.parser import HTMLParser
from pathlib import Path
from urllib.parse import unquote, urlparse
import requests
API_URL = "https://vi.wikipedia.org/w/api.php"
DEFAULT_OUTPUT_DIR = Path(__file__).resolve().parents[1] / "tmp" / "wiki"
USER_AGENT = "UltimateHistoryMapWikiImporter/1.0"
ALLOWED_TAGS = {
"p",
"blockquote",
"h2",
"h3",
"h4",
"h5",
"h6",
"ul",
"ol",
"li",
"b",
"strong",
"i",
"em",
"code",
"pre",
"a",
"br",
}
SKIP_TAGS = {
"audio",
"canvas",
"figure",
"form",
"iframe",
"img",
"input",
"map",
"math",
"meta",
"noscript",
"picture",
"script",
"style",
"svg",
"table",
"video",
}
SKIP_CLASS_PARTS = (
"ambox",
"authority-control",
"catlinks",
"error",
"hatnote",
"metadata",
"mw-editsection",
"mw-empty-elt",
"navbox",
"navigation-not-searchable",
"noprint",
"reference",
"reflist",
"shortdescription",
"sidebar",
"toc",
"vertical-navbox",
)
VOID_TAGS = {"br"}
class WikiHtmlSanitizer(HTMLParser):
def __init__(self) -> None:
super().__init__(convert_charrefs=False)
self.parts: list[str] = []
self.open_tags: list[str] = []
self.skip_depth = 0
def handle_starttag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
tag = tag.lower()
if self.skip_depth:
self.skip_depth += 1
return
attr_map = {name.lower(): value or "" for name, value in attrs}
if tag in SKIP_TAGS or self._has_skipped_class(attr_map.get("class", "")):
self.skip_depth = 1
return
if tag not in ALLOWED_TAGS:
return
if tag == "a":
self.parts.append('<a href="__missing__">')
elif tag == "br":
self.parts.append("<br>")
return
else:
self.parts.append(f"<{tag}>")
self.open_tags.append(tag)
def handle_startendtag(self, tag: str, attrs: list[tuple[str, str | None]]) -> None:
tag = tag.lower()
if self.skip_depth:
return
attr_map = {name.lower(): value or "" for name, value in attrs}
if tag in SKIP_TAGS or self._has_skipped_class(attr_map.get("class", "")):
return
if tag == "br":
self.parts.append("<br>")
def handle_endtag(self, tag: str) -> None:
tag = tag.lower()
if self.skip_depth:
self.skip_depth -= 1
return
if tag not in ALLOWED_TAGS or tag in VOID_TAGS:
return
for index in range(len(self.open_tags) - 1, -1, -1):
if self.open_tags[index] == tag:
while len(self.open_tags) > index:
closing_tag = self.open_tags.pop()
self.parts.append(f"</{closing_tag}>")
return
def handle_data(self, data: str) -> None:
if self.skip_depth:
return
if not data:
return
self.parts.append(html.escape(data, quote=False))
def handle_entityref(self, name: str) -> None:
if self.skip_depth:
return
self.parts.append(f"&{name};")
def handle_charref(self, name: str) -> None:
if self.skip_depth:
return
self.parts.append(f"&#{name};")
def get_html(self) -> str:
while self.open_tags:
self.parts.append(f"</{self.open_tags.pop()}>")
return "".join(self.parts)
@staticmethod
def _has_skipped_class(class_value: str) -> bool:
classes = class_value.lower().split()
return any(any(part in cls for part in SKIP_CLASS_PARTS) for cls in classes)
def title_from_source(source: str) -> str:
parsed = urlparse(source)
if parsed.scheme and parsed.netloc:
if "/wiki/" in parsed.path:
return unquote(parsed.path.rsplit("/wiki/", 1)[1]).replace("_", " ")
raise ValueError(f"Unsupported Wikipedia URL: {source}")
return source.replace("_", " ").strip()
def slugify_title(title: str) -> str:
text = unicodedata.normalize("NFD", title.strip().lower())
text = "".join(ch for ch in text if unicodedata.category(ch) != "Mn")
text = text.replace("đ", "d")
text = re.sub(r"[^a-z0-9]+", "-", text)
return text.strip("-") or "wiki"
def fetch_wikipedia_html(title: str) -> tuple[str, str]:
response = requests.get(
API_URL,
params={
"action": "parse",
"page": title,
"prop": "text",
"format": "json",
"formatversion": "2",
"redirects": "1",
"disableeditsection": "1",
},
headers={"User-Agent": USER_AGENT},
timeout=30,
)
response.raise_for_status()
payload = response.json()
if "error" in payload:
raise RuntimeError(json.dumps(payload["error"], ensure_ascii=False))
parsed = payload.get("parse") or {}
fetched_title = str(parsed.get("title") or title).strip()
article_html = str(parsed.get("text") or "")
if not article_html.strip():
raise RuntimeError(f"No article HTML returned for title: {title}")
return fetched_title, article_html
def sanitize_wikipedia_html(article_html: str) -> str:
parser = WikiHtmlSanitizer()
parser.feed(article_html)
parser.close()
content = html.unescape(parser.get_html())
content = normalize_fragment(content)
return content
def normalize_fragment(content: str) -> str:
content = re.sub(r"\r\n?", "\n", content)
content = re.sub(r"[ \t\f\v]+", " ", content)
content = re.sub(r"\s*\n\s*", "\n", content)
content = re.sub(r">\s+<", "><", content)
content = re.sub(r"<(p|li|h[2-6]|blockquote)>\s*</\1>", "", content)
content = re.sub(r"<(ul|ol)>\s*</\1>", "", content)
content = re.sub(r"(</(?:p|h[2-6]|ul|ol|li|blockquote|pre)>)", r"\1\n", content)
content = re.sub(r"\n{2,}", "\n", content)
return content.strip()
def put_first_paragraph_in_blockquote(content: str) -> str:
match = re.search(r"<p>(.*?)</p>", content, flags=re.S)
if not match:
return content
quote_inner = match.group(1).strip()
before = content[: match.start()].strip()
after = content[match.end() :].strip()
parts = []
if quote_inner:
parts.append(f"<blockquote>{quote_inner}</blockquote>")
if before:
parts.append(before)
if after:
parts.append(after)
return "\n".join(parts).strip()
def write_article(source: str, output_dir: Path, output_name: str | None = None) -> Path:
title = title_from_source(source)
fetched_title, article_html = fetch_wikipedia_html(title)
content = sanitize_wikipedia_html(article_html)
content = put_first_paragraph_in_blockquote(content)
filename = output_name or f"{slugify_title(fetched_title)}.html"
if not filename.endswith(".html"):
filename = f"{filename}.html"
output_dir.mkdir(parents=True, exist_ok=True)
output_path = output_dir / filename
output_path.write_text(content + "\n", encoding="utf-8")
return output_path
def main() -> int:
parser = argparse.ArgumentParser(description="Fetch a Vietnamese Wikipedia article into UHM wiki HTML format.")
parser.add_argument("source", help="Vietnamese Wikipedia URL or page title.")
parser.add_argument("--output-dir", type=Path, default=DEFAULT_OUTPUT_DIR)
parser.add_argument("--output-name", help="Output filename. Defaults to a slug from the fetched title.")
args = parser.parse_args()
output_path = write_article(args.source, args.output_dir, args.output_name)
print(output_path)
return 0
if __name__ == "__main__":
try:
raise SystemExit(main())
except Exception as exc:
print(f"error: {exc}", file=sys.stderr)
raise SystemExit(1)