-
Notifications
You must be signed in to change notification settings - Fork 28
/
util.py
126 lines (108 loc) · 3.66 KB
/
util.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
import json
import collections
import re
import mwparserfromhell as mw
from typing import *
# Splits a (stripped) parameter name into a lazily matched base key and an
# optional run of trailing digits: "id12" -> ("id", "12"), "name" -> ("name", None).
VERSION_EXTRACTOR = re.compile(r"(.*?)([0-9]+)?$")

# Matches an HTML comment ("<!-- ... -->"); DOTALL lets a comment span lines.
COMMENT_PATTERN = re.compile(r"(<!--.*?-->)", flags=re.DOTALL)


def each_version(template_name: str, code, include_base: bool = False,
                 mergable_keys: List[str] = None) -> Iterator[Tuple[int, Dict[str, Any]]]:
    """
    each_version is a generator that yields each version of an infobox
    with variants, such as {{Infobox Item}} on [[Ring of charos]]

    Parameters named "<key><N>" (for example "id2") belong to version N;
    parameters without a trailing number are shared by every version.
    Versions are discovered by looking for "version1", "version2", ... and
    stopping at the first missing number (at most 125 are considered).
    Numbered parameters whose version was not discovered are ignored.

    Args:
        template_name: name handed to ``t.name.matches`` to select templates.
        code: object exposing ``filter_templates(matches=...)``; presumably a
            mwparserfromhell ``Wikicode`` - confirm against callers.
        include_base: when the infobox is split into versions, also yield the
            shared parameters on their own as ``(-1, base)`` first.
        mergable_keys: per-version keys that do not justify a split. Defaults
            to ``["version", "image", "caption"]``.

    Yields:
        ``(-1, base)`` when the infobox has no versions, or when every
        per-version key is in ``mergable_keys``. Otherwise one
        ``(N, {**base, **version_N})`` pair per version, in ascending order.
        Values are the parameters' ``value`` attributes, unconverted.

    Raises:
        AssertionError: a parameter name cannot be parsed by
            VERSION_EXTRACTOR (e.g. it contains an embedded newline).
    """
    if mergable_keys is None:
        mergable_keys = ["version", "image", "caption"]

    infoboxes = code.filter_templates(matches=lambda t: t.name.matches(template_name))
    for infobox in infoboxes:
        # Normalise each name exactly once. Version discovery must look at the
        # stripped name too, otherwise "| version1 = x" (leading whitespace)
        # is never recognised as a version.
        named_params = [(str(p.name).strip(), p) for p in infobox.params]
        names = {name for name, _ in named_params}

        versions: Dict[int, Dict[str, Any]] = {}
        for i in range(1, 126):
            if f"version{i}" not in names:
                break
            versions[i] = {}

        base: Dict[str, Any] = {}
        for name, param in named_params:
            matcher = VERSION_EXTRACTOR.match(name)
            if matcher is None:
                raise AssertionError(f"cannot parse parameter name {name!r}")
            key, suffix = matcher.groups()
            if suffix is None:
                base[key] = param.value
                continue
            version = int(suffix)
            # Numbered parameters of an undiscovered version are dropped.
            if version in versions:
                versions[version][key] = param.value

        all_mergable = all(
            key in mergable_keys
            for overrides in versions.values()
            for key in overrides
        )
        if not versions or all_mergable:
            # Nothing distinguishes the versions: report the infobox once.
            yield (-1, base)
            continue

        if include_base:
            yield (-1, base)
        for version_id, overrides in versions.items():
            yield (version_id, {**base, **overrides})
def write_json(name: str, minName: str, docs: Dict[Any, Dict[str, Any]]):
    """
    Write ``docs`` to two JSON files, ordered by numeric id.

    Keys starting with "__" are dropped from every document. Documents that
    have nothing left besides an optional "name" are omitted from both files.

    Args:
        name: path of the indented file; documents keep their "name" key.
        minName: path of the compact file (no whitespace); "name" is removed.
        docs: mapping of id -> document. Every id must be accepted by
            ``int()``, since it is used as the sort key.

    Raises:
        ValueError: an id of an emitted document is not convertible to int.
    """
    items = []
    for doc_id, doc in docs.items():
        named = {k: v for k, v in doc.items() if not k.startswith("__")}
        nameless = {k: v for k, v in named.items() if k != "name"}
        if nameless:
            items.append((doc_id, named, nameless))
    items.sort(key=lambda item: int(item[0]))

    # The files are only written, so plain "w" is enough; the explicit
    # encoding keeps the result independent of the platform locale.
    with_names = collections.OrderedDict((doc_id, named) for doc_id, named, _ in items)
    with open(name, "w", encoding="utf-8") as fi:
        json.dump(with_names, fi, indent=2)

    without_names = collections.OrderedDict((doc_id, nameless) for doc_id, _, nameless in items)
    with open(minName, "w", encoding="utf-8") as fi:
        json.dump(without_names, fi, separators=(",", ":"))
def get_doc_for_id_string(source: str, version: Dict[str, str], docs: Dict[str, Dict],
                          allow_duplicates: bool = False) -> Optional[Dict]:
    """
    Create one document for all ids listed in ``version["id"]`` and register
    it in ``docs`` under each of those ids.

    The id field is a comma separated list; entries are stripped and only
    purely numeric ones are kept.

    Args:
        source: page name, stored in the document as "__source__" and used
            in the printed diagnostics.
        version: infobox parameters; must contain "id" for a document to be
            created. The value is passed through ``str()`` before parsing.
        docs: id -> document registry, updated in place.
        allow_duplicates: when False, an id already registered by another
            document is reported and makes the call return None.

    Returns:
        The new document (initially ``{"__source__": source}``), or None if
        the id is missing, has no numeric entry, or collides with an existing
        document. Note that on a collision ``docs`` is still re-pointed at the
        new document for every listed id.
    """
    if "id" not in version:
        print("page {} is missing an id".format(source))
        return None

    candidates = (part.strip() for part in str(version["id"]).split(","))
    # "".isdigit() is False, so blank entries are filtered out here as well.
    ids = [part for part in candidates if part.isdigit()]
    if not ids:
        print("page {} has an empty id".format(source))
        return None

    doc = {"__source__": source}
    invalid = False
    for doc_id in ids:
        existing = docs.get(doc_id)
        # An id repeated within this same page maps to ``doc`` itself and is
        # not a conflict with another page.
        if not allow_duplicates and doc_id in docs and existing is not doc:
            print("page {} has the same id as {}".format(source, existing["__source__"]))
            invalid = True
        docs[doc_id] = doc

    return None if invalid else doc
def copy(name: Union[str, Tuple[str, str]],
         doc: Dict,
         version: Dict[str, Any],
         convert: Callable[[Any], Any] = lambda x: x) -> bool:
    """
    Copy one value from ``version`` into ``doc``, optionally converting it.

    The value is turned into text with ``str()``, HTML comments are removed
    and surrounding whitespace is stripped before ``convert`` sees it.

    Args:
        name: the key to copy, or a ``(source_key, destination_key)`` pair
            when the key should be renamed in ``doc``.
        doc: destination mapping, updated in place.
        version: source mapping.
        convert: callable applied to the cleaned text; identity by default.

    Returns:
        True if a value was stored. False if the key is missing, the value is
        blank once comments are removed, or the converted value is falsy
        (so results such as 0 or "" are never stored).
    """
    if isinstance(name, str):
        src_name = dst_name = name
    else:
        src_name, dst_name = name[0], name[1]

    if src_name not in version:
        return False

    # Remove comments *before* the emptiness check: a comment-only value such
    # as "<!-- unknown -->" is blank and must not reach convert(), where
    # converters like int/float would raise on the empty string.
    text = COMMENT_PATTERN.sub("", str(version[src_name])).strip()
    if text == "":
        return False

    newval = convert(text)
    if not newval:
        return False
    doc[dst_name] = newval
    return True
def has_template(name: str, code) -> bool:
    """
    Report whether ``code`` contains at least one template called ``name``.

    ``code`` must expose ``filter_templates(matches=...)``; each candidate
    template is tested with ``template.name.matches(name)``.
    """
    def is_wanted(template) -> bool:
        return template.name.matches(name)

    found = code.filter_templates(matches=is_wanted)
    return len(found) > 0