-
Notifications
You must be signed in to change notification settings - Fork 4
/
ngrams.py
148 lines (126 loc) · 4.04 KB
/
ngrams.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
from collections import defaultdict
import itertools
# Words with a frequency below this are ignored by default.
MIN_FREQ = 10
# NOTE(review): unused module-level default; extract_ngrams takes a
# `letters_only` keyword instead. Kept for backward compatibility.
LETTERS_ONLY = False
def extract_ngrams(
    text,
    n,
    *,
    frequencies=None,
    cutoff=0.999,
    min_freq=MIN_FREQ,
    encoding="utf-8",
    letters_only=False
):
    """Extract character n-grams from an iterable of byte-string words.

    Args:
        text: iterable of bytes objects, one word per item.
        n: length of the n-grams to extract.
        frequencies: optional parallel iterable of per-word frequencies
            (anything ``int()`` accepts, e.g. ``b"123"``). When None, every
            word is weighted ``min_freq`` so the threshold check passes.
        cutoff: cumulative-probability cutoff; once the running total of
            probabilities exceeds it, remaining (rarer) n-grams are dropped.
        min_freq: words with a frequency below this are skipped.
        encoding: encoding used to decode each word; undecodable words
            are skipped.
        letters_only: drop n-grams containing any non-letter character.

    Returns:
        dict mapping n-gram -> probability, ordered by descending frequency
        and truncated at the cumulative ``cutoff``.
    """
    if frequencies is None:
        # Constant weight for every word; repeat() is the idiomatic
        # infinite stream (the original used cycle([min_freq])).
        frequencies = itertools.repeat(min_freq)
    ngrams = defaultdict(int)
    for word, freq in zip(text, frequencies):
        freq = int(freq)
        if freq < min_freq:
            continue
        try:
            word = word.strip().decode(encoding)
        except UnicodeDecodeError:
            # Skip words that are not valid in the requested encoding.
            continue
        for i in range(len(word) - n + 1):
            ngrams[word[i : i + n]] += freq
    if letters_only:
        # Materialize the keys so we can delete while iterating.
        for ngram in list(ngrams):
            if any(not c.isalpha() for c in ngram):
                del ngrams[ngram]
    # Rank by descending frequency (stable, so ties keep insertion order,
    # matching the original -count sort key), normalize to probabilities,
    # and cut the low-probability tail once the cumulative mass exceeds
    # `cutoff`.
    total = sum(ngrams.values())
    result = {}
    cumulative = 0.0
    for ngram, count in sorted(ngrams.items(), key=lambda kv: kv[1], reverse=True):
        prob = count / total
        cumulative += prob
        if cumulative > cutoff:
            break
        result[ngram] = prob
    return result
def extract_ngrams_from_file(filename, *args, **kwargs):
    """Extract n-grams from a dictionary file, trying three formats in order.

    1. A plain word list opened as ``filename`` directly.
    2. harfbuzz-testing-wikipedia format: parallel ``filename + ".txt.bz2"``
       (words) and ``filename + ".frq.bz2"`` (frequencies) files.
    3. hunspell format: ``stem + ".dic"`` / ``stem + ".aff"`` pair, where the
       .aff file's ``SET`` line overrides the ``encoding`` keyword and the
       .dic entries are stripped of their ``/flags`` suffix.

    Extra positional/keyword arguments are forwarded to extract_ngrams.

    Raises:
        FileNotFoundError: if no format variant of ``filename`` exists.
    """
    frqfile = None
    try:
        # Fix: read eagerly inside `with` so the handle is closed (the
        # original leaked every file handle it opened).
        with open(filename, "rb") as f:
            txtfile = f.read().splitlines()
    except FileNotFoundError:
        try:
            import bz2
            # Assume harfbuzz-testing-wikipedia format.
            with bz2.open(filename + ".txt.bz2") as f:
                txtfile = f.read().splitlines()
            with bz2.open(filename + ".frq.bz2") as f:
                frqfile = f.read().splitlines()
        except FileNotFoundError:
            try:
                # Assume hunspell dictionary format; accept either the
                # .aff or .dic path (or the bare stem).
                stem = filename
                if filename.endswith((".aff", ".dic")):
                    stem = filename[:-4]
                with open(stem + ".aff", "rb") as afffile:
                    for line in afffile:
                        if line.startswith(b"SET"):
                            kwargs["encoding"] = (
                                line.replace(b"\t", b" ").split()[1].decode("ascii")
                            )
                            break
                with open(stem + ".dic", "rb") as dicfile:
                    # Skip the entry-count header line.
                    lines = dicfile.read().splitlines()[1:]
                # Drop hunspell affix flags ("word/FLAGS" -> "word").
                txtfile = [
                    s if s.find(b"/") == -1 else s[: s.find(b"/")] for s in lines
                ]
            except FileNotFoundError:
                raise FileNotFoundError("File not found: %s" % filename)
    return extract_ngrams(txtfile, *args, frequencies=frqfile, **kwargs)
if __name__ == "__main__":
    import argparse
    parser = argparse.ArgumentParser(
        "python3 ngrams.py",
        description="Find ngrams from a language dictionary.",
    )
    parser.add_argument("dict", metavar="dict", nargs="+", help="Dictionary file.")
    # Fix: defaults belong in argparse itself. The original applied them
    # with `value or default`, which silently replaced explicit falsy
    # values (e.g. `--cutoff 0` became 0.999).
    parser.add_argument(
        "-n",
        "--ngram",
        type=int,
        default=2,
        help="Length of ngrams. Default: 2",
    )
    parser.add_argument(
        "-c",
        "--cutoff",
        type=float,
        default=0.999,
        help="Cutoff probability. Default: .999",
    )
    parser.add_argument(
        "-e",
        "--encoding",
        type=str,
        default="utf-8",
        help="Text encoding. Default: utf-8",
    )
    parser.add_argument(
        "-l",
        "--letters-only",
        action="store_true",
        help="Only list ngrams of letters. Default: False",
    )
    # parse_args() defaults to sys.argv[1:]; no need to import sys.
    options = parser.parse_args()
    # Sum per-dictionary probabilities so n-grams common to several
    # dictionaries accumulate weight.
    all_ngrams = defaultdict(float)
    for dictfile in options.dict:
        ngrams = extract_ngrams_from_file(
            dictfile,
            options.ngram,
            cutoff=options.cutoff,
            encoding=options.encoding,
            letters_only=options.letters_only,
        )
        for ngram, prob in ngrams.items():
            all_ngrams[ngram] += prob
    # One "ngram probability" line per entry.
    for ngram, prob in all_ngrams.items():
        print(ngram, prob)