#!/usr/bin/env python
# encoding: utf8
## Copyright 2010 Yoav Goldberg
##
## This program is free software: you can redistribute it and/or modify
## it under the terms of the GNU General Public License as published by
## the Free Software Foundation, either version 3 of the License, or
## (at your option) any later version.
##
## This program is distributed in the hope that it will be useful,
## but WITHOUT ANY WARRANTY; without even the implied warranty of
## MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
## GNU General Public License for more details.
##
## You should have received a copy of the GNU General Public License
## along with this program. If not, see <http://www.gnu.org/licenses/>.
"""
Robust Hebrew Tokenizer
works as a filter:
hebtokenizer.py < in > out
run as:
hebtokenizer.py -h
for options
KNOWN ISSUES:
- NOT VERY FAST!!!
- transition from hebrew words to numbers: ב-23:00 will be cut as ב-23 :00
- deliberately not segmenting משהוכלב from start of words before numbers/quotes/dashes
- emoticons are not supported (treated as punctuation)
- ' is always kept at end of hebrew chunks (a document level pre/post processing could help here)
- !!!!!!!111111 are split to !!!!!!!! 1111111
"""
#########
import re
import codecs
#### token-type callbacks: re.Scanner invokes each action as action(scanner, token);
#### the scanner argument is unused here
def heb(s,t): return ('HEB',t)
def eng(s,t): return ('ENG',t)
def num(s,t): return ('NUM',t)
def url(s,t): return ('URL',t)
def punct(s,t): return ('PUNCT',t)
def junk(s,t): return ('JUNK',t)
#### patterns
_NIKUD = u"\u05b0-\u05c4"   # Hebrew points (nikud)
_TEAMIM = u"\u0591-\u05af"  # cantillation marks (currently unused below)
# expand Hebrew/Yiddish ligature digraphs (וו, וי, יי, אל) and drop zero-width joiners
undigraph = lambda x: x.replace(u"\u05f0",u"וו").replace(u"\u05f1",u"וי").replace(u"\u05f2",u"יי").replace(u"\ufb4f",u"אל").replace(u"\u200d",u"")
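## e.g. (illustrative): undigraph(u"\u05f2\u05d3\u05d9\u05e9") == u"יידיש"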
_heb_letter = ur"([א-ת%s]|[דגזצתט]')" % _NIKUD
# a heb word including single quotes, dots and dashes / this leaves last-dash out of the word
_heb_word_plus = ur"[א-ת%s]([.'`\"\-/\\]?['`]?[א-ת%s0-9'`])*" % (_NIKUD,_NIKUD)
# english/latin words (do not care about abbreviations vs. eos for english)
_eng_word = ur"[a-zA-Z][a-zA-Z0-9'.]*"
# numerical expression (numbers and various separators)
#_numeric = r"[+-]?[0-9.,/\-:]*[0-9%]"
_numeric = r"[+-]?([0-9][0-9.,/\-:]*)?[0-9]%?"
# url
_url = r"[a-z]+://\S+"
# punctuations
_opening_punc = r"[\[('`\"{]"
_closing_punc = r"[\])'`\"}]"
_eos_punct = r"[!?.]+"
_internal_punct = r"[,;:\-&]"
# junk
#_junk = ur"[^א-ת%sa-zA-Z0-9%%&!?.,;:\-()\[\]{}\"'\/\\+]+" % _NIKUD
_junk = ur"[^א-ת%sa-zA-Z0-9!?.,:;\-()\[\]{}]+" % _NIKUD
is_all_heb = re.compile(ur"^%s+$" % _heb_letter, re.UNICODE).match
is_a_number = re.compile(r"^%s$" % _numeric, re.UNICODE).match
is_all_lat = re.compile(r"^[a-zA-Z]+$", re.UNICODE).match
is_sep = re.compile(r"^\|+$").match
is_punct = re.compile(r"^[.?!]+").match
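## Illustrative checks (not from the original file): these predicates can be used
## to post-filter tokens, e.g. is_all_heb(u"שלום") and is_a_number(u"-12,345")
## both return a match object, while is_all_heb(u"abc") returns None.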
#### scanner
scanner = re.Scanner([
    (r"\s+", None),
    (_url, url),
    (_heb_word_plus, heb),
    (_eng_word, eng),
    (_numeric, num),
    (_opening_punc, punct),
    (_closing_punc, punct),
    (_eos_punct, punct),
    (_internal_punct, punct),
    (_junk, junk),
])
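## NOTE: re.Scanner tries the rules in the order listed above, so the more
## specific patterns (urls, hebrew/english words, numbers) must stay ahead of
## the generic punctuation and junk rules; whitespace is matched first and
## dropped (its action is None).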
##### tokenize
def tokenize(sent):
    parts, remainder = scanner.scan(sent)
    assert not remainder, "untokenizable input: %r" % remainder
    return parts
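## A minimal sketch of calling tokenize() directly (Python 2, matching the u""
## literals and print statement used in this file; Hebrew is shown literally,
## though the interpreter would print \uXXXX escapes):
##     >>> tokenize(u"שלום, world! 123")
##     [('HEB', u'שלום'), ('PUNCT', u','), ('ENG', u'world'),
##      ('PUNCT', u'!'), ('NUM', u'123')]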
if __name__=='__main__':
    import sys
    from optparse import OptionParser
    parser = OptionParser("%prog [options] < in_file > out_file")
    parser.add_option("-i","--ie",help="input encoding [default %default]",dest="in_enc",default="utf_8_sig")
    parser.add_option("-o","--oe",help="output encoding [default %default]",dest="out_enc",default="utf_8")
    opts, args = parser.parse_args()
    #FILTER = set(['JUNK','ENG'])
    FILTER = set()
    for sent in codecs.getreader(opts.in_enc)(sys.stdin):
        #print u"\n".join(["%s %s" % (which,tok) for which,tok in tokenize(sent) if which not in FILTER]).encode("utf8")
        print " ".join([tok for (which,tok) in tokenize(sent)]).encode(opts.out_enc)