-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathbigram_model.py
63 lines (48 loc) · 1.77 KB
/
bigram_model.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
"""
Description: Bigram model for Language identification
Authors: Diptanu Sarkar, [email protected], Saral Nyathawada, [email protected]
"""
# Importing python3 libraries for the project
import codecs
from nltk.collocations import BigramCollocationFinder
import math
import numpy as np
import csv
def main(filename="wiki_data_10Kwords_allURLs.csv", min_freq=5):
    """Train one character-bigram frequency model per language and save it.

    Reads a UTF-8 CSV file whose first row is a header and whose remaining
    rows hold a text sample in column 0 and a language label in column 1.
    For every language the samples are concatenated, adjacent character
    pairs are counted with nltk's ``BigramCollocationFinder``, pairs seen
    fewer than ``min_freq`` times are discarded, and the remaining
    ``((char, char), count)`` rows are written, most frequent first, to
    ``<language>.npy`` in the current working directory.

    Args:
        filename: Path of the training CSV file.
        min_freq: Minimum number of occurrences a bigram needs to be kept.
    """
    # Dictionary mapping each language to the list of its text samples
    data_dict = {}
    # newline="" lets the csv module handle line endings inside quoted fields
    with open(filename, "r", encoding="utf-8", newline="") as f:
        reader = csv.reader(f)
        # Read and ignore the header (tolerating a completely empty file)
        next(reader, None)
        for line in reader:
            # Blank or truncated rows have no language column; skip them
            if len(line) < 2:
                continue
            language = line[1]
            # Rows without a language label cannot be attributed to a model
            if language == "":
                continue
            # Keep every sample, including the first one seen for a language
            data_dict.setdefault(language, []).append(line[0])

    # For each language build, filter, sort and save the bigram model
    for lang, samples in data_dict.items():
        # The finder expects a token sequence; here each token is one character
        char_sequence = list("".join(samples))
        finder = BigramCollocationFinder.from_words(char_sequence)
        # Ignore bigrams which occur fewer than min_freq times
        finder.apply_freq_filter(min_freq)
        # Most frequent bigrams first
        bigram_model = sorted(
            finder.ngram_fd.items(), key=lambda item: item[1], reverse=True
        )
        # Each row pairs a tuple with an int, which is a ragged structure;
        # an explicit object array is required for NumPy to accept it.
        np.save(lang + ".npy", np.array(bigram_model, dtype=object))
# Script entry point: train the models only when this file is executed
# directly. When the file is imported as a module, nothing is run.
if __name__ == "__main__":
    main()