-
Notifications
You must be signed in to change notification settings - Fork 0
/
filter_sentences.py
72 lines (51 loc) · 1.6 KB
/
filter_sentences.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
"""
Filter trash sentences out of a tab-separated mention file.

Usage: filter_sentences.py INPUT_FILE OUTPUT_FILE
"""
import sys
import re
# Positional command-line arguments: the file to read rows from, then the
# file the surviving rows are written to.
input_filename, output_filename = sys.argv[1], sys.argv[2]
# Characters a mention may consist of: whitespace, digits, word characters,
# hyphens and commas. Compiled once because check_mention runs per input row.
_MENTION_CHARS_RE = re.compile(r'^[\s\d\w\-\,]*$')


def check_mention(mention, max_spaces=3, max_length=50):
    """Return True if *mention* looks like a usable mention, False if trash.

    A mention is rejected when any of the following holds:

    * it is empty;
    * it contains more than ``max_spaces`` space characters;
    * it is longer than ``max_length`` characters;
    * it has more digits than letters;
    * it has more non-alphanumeric characters than letters;
    * it contains a character other than whitespace, digits, word
      characters, ``-`` or ``,``.

    Args:
        mention: The mention text to check.
        max_spaces: Largest number of space characters allowed.
        max_length: Largest number of characters allowed.

    Returns:
        bool: True when the mention passes every check.
    """
    # An empty mention would otherwise slip through every ratio check below
    # (0 > 0 is false) and match the character pattern, so reject it up front.
    if not mention:
        return False

    # Spaces are counted, not words: "a b c d" has 3 spaces and is accepted.
    if mention.count(' ') > max_spaces:
        return False

    length = len(mention)
    if length > max_length:
        return False

    digits = sum(c.isdigit() for c in mention)
    letters = sum(c.isalpha() for c in mention)
    # Everything that is neither a digit nor a letter (spaces, punctuation...).
    other = length - digits - letters
    if digits > letters or other > letters:
        return False

    return _MENTION_CHARS_RE.match(mention) is not None
def convert_concept(concept):
    """Extract the part of a link that follows the first ``wiki/``.

    Leading and trailing ``<`` / ``>`` characters are removed first.

    Args:
        concept: The raw link text.

    Returns:
        str: Everything after the first ``wiki/`` in the link, or an empty
        string when the link contains no ``wiki/``.
    """
    bare_link = concept.strip('><')
    found = re.search(r'wiki/(.*)$', bare_link)
    return found.group(1) if found else ''
# Lookup table built from ./cleaned_redirects.tsv: the first tab-separated
# field of each row maps to the second.
redirects = {}
with open('./cleaned_redirects.tsv') as fd:
    for line in fd:
        fields = line.strip().split('\t')
        if len(fields) != 2:
            # Blank or malformed row (e.g. a trailing empty line): skip it
            # instead of aborting the whole run on a failed unpack.
            continue
        from_c, to_c = fields
        redirects[from_c] = to_c
# Copy to the output every input row that has exactly four tab-separated
# fields, an acceptable mention text, and at least 10 characters of
# surrounding context. The link field is replaced by its concept name
# (after redirect lookup) and moved to the front of the row.
with open(input_filename) as inp, open(output_filename, 'w') as out:
    for line in inp:
        fields = line.split('\t')
        if len(fields) != 4:
            continue
        left_context, mention_text, mention_link, right_context = fields

        if not check_mention(mention_text):
            continue
        # right_context still carries the line's trailing newline, if any,
        # so it counts towards the length and terminates the written row.
        if len(left_context) + len(right_context) < 10:
            continue

        concept = convert_concept(mention_link)
        concept = redirects.get(concept, concept)
        out.write("\t".join((concept, left_context, mention_text, right_context)))