-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathGrams.py
183 lines (139 loc) · 7.43 KB
/
Grams.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
import csv
import sys
import copy
import operator
import random
# Raise the csv per-field size cap as high as the platform accepts so very
# large text fields can be read. csv.field_size_limit() stores the value in a
# C long; where that type is narrower than sys.maxsize (e.g. 64-bit Windows)
# the call raises OverflowError, so halve the request until it is accepted.
_max_field_size = sys.maxsize
while True:
    try:
        csv.field_size_limit(_max_field_size)
        break
    except OverflowError:
        _max_field_size //= 2
class NGram:
    """Word n-gram counts and conditional probabilities over a token corpus.

    The corpus is an indexable, sliceable sequence of string tokens (normally
    a list of words). An n-gram is stored as its tokens joined by a single
    space, so tokens are assumed not to contain spaces themselves.

    Results are kept per n-gram order in parallel lists whose index 0 holds
    the unigrams, index 1 the bigrams, and so on:

    * count maps:        ``{gram: occurrences}``
    * probability maps:  ``{gram: P(last word | preceding words)}``; for
      unigrams this is ``count / corpus length``
    * ordered lists:     ``(gram, value)`` pairs sorted ascending by value

    Public accessors take a 1-based order (1 = unigram) and compute the
    results lazily when the requested order has not been calculated yet.
    """

    def __init__(self, lst_word_corpus=None):
        """Create a model over ``lst_word_corpus``.

        :param lst_word_corpus: sequence of string tokens. When omitted the
            corpus is ``[""]`` (a single empty token).
        """
        # None stands in for the default so no mutable list object is shared
        # between instances created without an argument.
        if lst_word_corpus is None:
            lst_word_corpus = [""]
        self.set_text(lst_word_corpus)

    # ----------------------------Private helpers------------------------------------------------#
    def __clear_results(self):
        """Drop every cached count / probability / ordered list."""
        self.__lstmap_gramCounts = []
        self.__lstmap_gramProbabilities = []
        self.__lstlst_orderedCountLst = []
        self.__lstlst_orderedProbLst = []

    def __check_order(self, int_gram_index):
        """Reject n-gram orders below 1.

        :raises IndexError: if ``int_gram_index`` is smaller than 1. Without
            this guard ``order - 1`` becomes a negative list index and
            silently selects an unrelated order.
        """
        if int_gram_index < 1:
            raise IndexError(
                "n-gram order must be at least 1, got %r" % (int_gram_index,))

    def __ensure_calculated(self, int_gram_index):
        """Run :meth:`calculate` if order ``int_gram_index`` is not cached."""
        self.__check_order(int_gram_index)
        if len(self.__lstmap_gramCounts) < int_gram_index:
            self.calculate(int_gram_index)

    # ----------------------------Simple Accessors------------------------------------------------#
    def get_gram_probabilities(self):
        """Return the list of probability maps calculated so far."""
        return self.__lstmap_gramProbabilities

    def get_gram_counts(self):
        """Return the list of count maps calculated so far."""
        return self.__lstmap_gramCounts

    def get_corpus_length(self):
        """Return the number of tokens in the corpus."""
        return self.__int_corpusLength

    def get_corpus_set_length(self):
        """Return the number of distinct tokens in the corpus."""
        return self.__int_corpusLengthSet

    # These are index base 1 for consistency with length of n-gram
    def get_gram_probability(self, int_gram_index):
        """Return the probability map for n-grams of length ``int_gram_index``.

        :raises IndexError: if ``int_gram_index`` is smaller than 1.
        """
        self.__ensure_calculated(int_gram_index)
        return self.__lstmap_gramProbabilities[int_gram_index - 1]

    def get_gram_count(self, int_gram_index):
        """Return the count map for n-grams of length ``int_gram_index``.

        :raises IndexError: if ``int_gram_index`` is smaller than 1.
        """
        self.__ensure_calculated(int_gram_index)
        return self.__lstmap_gramCounts[int_gram_index - 1]

    def get_gram_probability_ordered(self, int_gram_index):
        """Return ``(gram, probability)`` pairs sorted ascending by probability.

        :raises IndexError: if ``int_gram_index`` is smaller than 1.
        """
        self.__ensure_calculated(int_gram_index)
        return self.__lstlst_orderedProbLst[int_gram_index - 1]

    def get_gram_count_ordered(self, int_gram_index):
        """Return ``(gram, count)`` pairs sorted ascending by count.

        :raises IndexError: if ``int_gram_index`` is smaller than 1.
        """
        self.__ensure_calculated(int_gram_index)
        return self.__lstlst_orderedCountLst[int_gram_index - 1]

    # ----------------------------Simple Mutators------------------------------------------------#
    def set_text(self, lst_word_corpus):
        """Replace the corpus and discard results computed for the old one.

        :param lst_word_corpus: sequence of string tokens.
        """
        self.__lst_wordCorpus = lst_word_corpus
        self.__lst_wordCorpusSet = set(lst_word_corpus)
        self.__int_corpusLength = len(lst_word_corpus)
        self.__int_corpusLengthSet = len(self.__lst_wordCorpusSet)
        # Cached results describe the previous corpus; clearing them makes the
        # lazy accessors recompute instead of returning stale data.
        self.__clear_results()

    # ---------------------------Private helper method for n gram count--------------------------#
    def __ngram_count(self, int_ngram_length):
        """Count every n-gram of ``int_ngram_length`` consecutive tokens.

        The resulting ``{gram: count}`` map is appended to the list of count
        maps, so :meth:`calculate` must call this in increasing order.
        """
        corpus = self.__lst_wordCorpus
        # A corpus of L tokens holds L - n + 1 windows of n consecutive
        # tokens (none when the corpus is shorter than n).
        window_total = max(0, self.__int_corpusLength - int_ngram_length + 1)
        gram_counts = {}
        for start in range(window_total):
            gram = " ".join(corpus[start:start + int_ngram_length])
            gram_counts[gram] = gram_counts.get(gram, 0) + 1
        self.__lstmap_gramCounts.append(gram_counts)

    def calculate(self, int_ngram_length, int_min_count=0):
        """Compute counts and probabilities for every order up to the given one.

        Any previously cached results are replaced.

        :param int_ngram_length: highest n-gram order to compute (>= 1);
            orders ``1 .. int_ngram_length`` are all calculated.
        :param int_min_count: when positive, grams occurring fewer than this
            many times are dropped from the counts (and therefore from the
            probabilities). The lazy accessors always use the default of 0.
        :returns: the list of probability maps, unigrams first.
        :raises IndexError: if ``int_ngram_length`` is smaller than 1.
        """
        self.__check_order(int_ngram_length)
        self.__clear_results()

        # calculate the count of each gram (IE 3 calculates unigram bigram and trigram)
        for order in range(1, int_ngram_length + 1):
            self.__ngram_count(order)

        if int_min_count > 0:
            self.__lstmap_gramCounts = [
                {gram: count for gram, count in gram_counts.items()
                 if count >= int_min_count}
                for gram_counts in self.__lstmap_gramCounts
            ]
        count_maps = self.__lstmap_gramCounts

        # Unigram probability is the relative frequency in the corpus. An
        # empty corpus has no unigrams, so the division is never reached.
        corpus_length = self.__int_corpusLength
        probability_maps = [
            {gram: count / corpus_length for gram, count in count_maps[0].items()}
        ]
        # Higher orders: P(last word | prefix) = count(gram) / count(prefix),
        # where the prefix is the gram without its last word. The prefix
        # always survives thresholding because it occurs at least as often as
        # any gram that extends it.
        for position in range(1, len(count_maps)):
            prefix_counts = count_maps[position - 1]
            probability_maps.append({
                gram: count / prefix_counts[gram.rsplit(" ", 1)[0]]
                for gram, count in count_maps[position].items()
            })
        self.__lstmap_gramProbabilities = probability_maps

        # for each gram - sort the count and probability lists and store
        by_value = operator.itemgetter(1)
        self.__lstlst_orderedProbLst = [
            sorted(gram_map.items(), key=by_value) for gram_map in probability_maps
        ]
        self.__lstlst_orderedCountLst = [
            sorted(gram_map.items(), key=by_value) for gram_map in count_maps
        ]
        return self.__lstmap_gramProbabilities

    def random_sentence_base(self, int_gram):
        """Return a random sentence seed of ``int_gram - 1`` words.

        A gram of order ``int_gram`` is picked uniformly among the distinct
        grams seen in the corpus and returned without its last word (so the
        result is an empty string when ``int_gram`` is 1).

        :raises IndexError: if ``int_gram`` is smaller than 1 or the corpus
            contains no gram of that order.
        """
        self.__ensure_calculated(int_gram)
        # random.choice only ever picks a valid index and raises IndexError
        # when the sequence is empty.
        gram, _count = random.choice(self.__lstlst_orderedCountLst[int_gram - 1])
        return ' '.join(gram.split(' ')[0:-1])  # get a base phrase the size of the gram - 1

    def random_sentence_next(self, curr_sentence, int_gram):
        """Pick a next word for ``curr_sentence`` using grams of order ``int_gram``.

        The context is the last ``int_gram - 1`` words of the sentence (fewer
        if the sentence is shorter, none for unigrams). Every gram that starts
        with exactly those words is a candidate, and the gram's last word is
        drawn with probability proportional to the gram's count.

        :param curr_sentence: words generated so far, separated by single
            spaces.
        :param int_gram: n-gram order to sample from (>= 1).
        :returns: the chosen word, or ``' '`` when no gram continues the
            context.
        :raises IndexError: if ``int_gram`` is smaller than 1.
        """
        self.__ensure_calculated(int_gram)

        sentence_words = curr_sentence.split(' ') if curr_sentence else []
        context_size = int_gram - 1
        # A [-0:] slice would select the whole sentence, so the unigram case
        # is handled explicitly.
        context_words = sentence_words[-context_size:] if context_size > 0 else []
        # The trailing space anchors the match on a word boundary, so the
        # context "a" does not match a gram that begins with "an".
        prefix = ' '.join(context_words) + ' ' if context_words else ''

        candidates = [
            (gram.rsplit(' ', 1)[-1], count)
            for gram, count in self.__lstlst_orderedCountLst[int_gram - 1]
            if gram.startswith(prefix)
        ]
        if not candidates:
            return ' '

        # Roulette-wheel selection: walk the cumulative counts until they
        # pass a uniformly drawn threshold.
        threshold = random.random() * sum(count for _word, count in candidates)
        cumulative = 0
        for word, count in candidates:
            cumulative += count
            if threshold < cumulative:
                return word
        # Only reachable through floating-point rounding at the upper edge.
        return candidates[-1][0]