-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathindex.py
115 lines (103 loc) · 3.02 KB
/
index.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
# Simple extended boolean search engine: indexer based on cranfield format
# Hussein Suleman
# 21 April 2016
import os
import re
import sys
import porter
import parameters
# Check the command line for the collection name. Without it there is nothing
# to index, so print the usage line and stop.
if len(sys.argv) < 2:
    print("Syntax: index.py <collection>")
    # Use sys.exit rather than the exit() helper injected by the site module:
    # that helper is intended for the interactive shell and does not exist
    # when the interpreter is started without site (python -S).
    sys.exit(0)
collection = sys.argv[1]
# Read and parse the input data - extract words, identifiers and titles.
#
# Three markers are recognised at the start of a line:
#   .I <number>  begins a new record and supplies its identifier
#   .T           the next line is the record's title
#   .W           the lines that follow are the record's body text
# Any other line is the title line (directly after .T), body text (after .W),
# or is ignored.
identifier = ''
document = ''
title = ''        # only ever assigned in this section, kept for compatibility
indoc = False     # True while reading body text after a .W marker
intitle = False   # True when the next line is the title
data = {}         # identifier -> body text
titles = {}       # identifier -> title line, at most 50 characters
id_pattern = re.compile(r'\.I ([0-9]+)')
with open(collection, "r", encoding='utf-8') as f:
    for line in f:
        mo = id_pattern.match(line)
        if mo:
            # A new record begins: store the body collected for the previous
            # one (records with an empty body are not stored).
            if document != '':
                data[identifier] = document
            identifier = mo.group(1)
            # Start from an empty body so that a record without a .W section
            # does not inherit the previous record's text.
            document = ''
            indoc = False
        elif line.startswith('.T'):
            title = ''
            intitle = True
        elif line.startswith('.W'):
            document = ''
            indoc = True
        elif intitle:
            # Only the first line after .T is used as the title.
            intitle = False
            if identifier != '':
                # Strip the newline (and nothing else) before truncating.
                titles[identifier] = line.rstrip('\n')[:50]
        elif indoc:
            document += " "
            document += line.lower() if parameters.case_folding else line
# Inside the loop a record is stored only when the next .I marker is seen, so
# the last record in the file has to be stored here.
if document != '':
    data[identifier] = document
# Create the inverted file in memory and write the document length/title file.
#
# index maps each word (stemmed when parameters.stemming is set) to a dict of
# {identifier: number of occurrences in that document}.
# Each line written to <collection>_index_len is "identifier:length:title",
# where length is the number of words counted in the document body.
index = {}
N = len(data)
p = porter.PorterStemmer()
# Words are maximal runs of ASCII letters and digits; every other character
# acts as a separator.
word_pattern = re.compile(r'[a-zA-Z0-9]+')
with open(collection + "_index_len", "w", encoding='utf-8') as g:
    for key in data:
        doc_length = 0
        for word in word_pattern.findall(data[key]):
            if parameters.stemming:
                word = p.stem(word, 0, len(word) - 1)
            doc_length += 1
            postings = index.setdefault(word, {})
            postings[key] = postings.get(key, 0) + 1
        # A document can have body text without a title line; write an empty
        # title in that case instead of failing with KeyError.
        print(key, doc_length, titles.get(key, ''), sep=':', file=g)
# Write the word counts for each document ID:
# <collection>_index_stem_count/<identifier> holds one "word:count" line for
# every indexed word that occurs in that document.
stem_count_dir = collection + "_index_stem_count"
# exist_ok tolerates a directory left behind by an earlier run without hiding
# unrelated errors (permissions, a regular file in the way) as a bare except
# would.
os.makedirs(stem_count_dir, exist_ok=True)
# Regroup the postings by document so that each output file is opened once
# instead of once per (word, document) pair. Word order within a document
# follows the iteration order of index.
stem_counts = {}
for stemmed_word in index:
    for file_id in index[stemmed_word]:
        stem_counts.setdefault(file_id, []).append(
            (stemmed_word, index[stemmed_word][file_id]))
for file_id, counts in stem_counts.items():
    # Mode "w" replaces a file left by a previous run; appending to it would
    # keep the old lines and duplicate every count.
    with open(stem_count_dir + "/" + file_id, "w", encoding='utf-8') as f:
        for stemmed_word, count in counts:
            print(stemmed_word, count, sep=':', file=f)
# Write the inverted index to files: <collection>_index/<word> holds one
# "identifier:count" line for each document in which the word occurs.
index_dir = collection + "_index"
# exist_ok tolerates an existing directory without silencing other errors the
# way a bare except would.
os.makedirs(index_dir, exist_ok=True)
for key, postings in index.items():
    # The with-statement closes the file even if a write fails part-way.
    with open(index_dir + "/" + key, "w", encoding='utf-8') as f:
        for entry, count in postings.items():
            print(entry, count, sep=':', file=f)
# Write N, the number of documents held in data, to <collection>_index_N.
# The with-statement closes the file even if the write fails.
with open(collection + "_index_N", "w", encoding='utf-8') as f:
    print(N, file=f)