ProjectRun.py
import nltk
import string
import csv
from Grams import *
# Concatenate the given columns from every listed CSV file into a single string.
def file_to_string(column_lst, *filenames):
    str_combined = ""
    for fname in filenames:
        with open(fname) as csvfile:
            article = csv.reader(csvfile)
            for row in article:
                str_combined += '\n'
                for i in column_lst:
                    str_combined += row[i]
    return str_combined
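# Usage sketch (hypothetical filenames): combine column 9 of two CSVs into one string:
#   text = file_to_string([9], "articles1.csv", "articles2.csv")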
# Lowercase the text and strip punctuation.
def normalize(str_data):
    str_data = str_data.lower()
    return str_data.translate(str.maketrans('', '', string.punctuation))
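# Example: normalize("Hello, World!") returns "hello world"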
# Filter a list of (item, value) pairs, keeping only those whose value is at least min_reps.
def minimum_rep_lst(lst_pair_wordprob, min_reps):
    return [pair for pair in lst_pair_wordprob if pair[1] >= min_reps]
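# Example: minimum_rep_lst([("the", 54), ("zyzzyva", 1)], 20) returns [("the", 54)]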
# Generate a sentence by seeding with a random base token, then repeatedly
# asking the model for the next word given the sentence so far.
def random_sentence(gram, order=1, sentence_length=10):
    sentence = gram.random_sentence_base(order) + ' '
    for i in range(sentence_length):
        sentence += gram.random_sentence_next(sentence, order) + ' '
    return sentence
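# Usage sketch (relies on the NGram interface imported from Grams):
#   print(random_sentence(gram, order=2, sentence_length=10))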
def print_core_data(ngram):
    print("CorpusLength: ", ngram.get_corpus_length())
    print("Total vocab: ", ngram.get_corpus_set_length())
def print_data(ngram, int_gram_order):
    sentence_length = 10
    orderedP = ngram.get_gram_probability_ordered(int_gram_order)
    orderedC = ngram.get_gram_count_ordered(int_gram_order)
    print("\n------------Probability and Count of Gram (top 15)", int_gram_order, "------------")
    print(orderedP[0:15])
    print(orderedC[0:15])
    print("\n-------------Generated Sentences (max length=", sentence_length, ")")
    print("SENTENCE1 ", random_sentence(ngram, int_gram_order, sentence_length))
    print("SENTENCE2 ", random_sentence(ngram, int_gram_order, sentence_length))
    print("SENTENCE3 ", random_sentence(ngram, int_gram_order, sentence_length))
# Build an NGram model from the given CSV columns and print summary statistics
# and sample sentences for each gram order.
def run_project(lst_columns, str_filename, mincount=0):
    str_data = file_to_string(lst_columns, str_filename)
    print("Parsing...")
    news_text = normalize(str_data)
    tokenized = nltk.word_tokenize(news_text)
    print("\n")
    print("------------------------Calculating nGrams------------------------")
    gram = NGram(tokenized)
    print_core_data(gram)
    gram.calculate(3, mincount)  # build unigram, bigram, and trigram counts (orders 1-3), keeping entries repeated at least mincount times
    print_data(gram, 3)
    print_data(gram, 2)
    print_data(gram, 1)
    return gram
NEWS_COL_INDEX = [9]
WINE_COL_INDEX = [1]
# run_project(columns_to_read, csv_filepath, mincount)
newsGram1 = run_project(NEWS_COL_INDEX, "/home/joseph/Documents/NLP_Grams/all-the-news/articles1.csv", 20)
del newsGram1
newsGram2 = run_project(NEWS_COL_INDEX, "/home/joseph/Documents/NLP_Grams/all-the-news/articles2.csv", 20)
del newsGram2
newsGram3 = run_project(NEWS_COL_INDEX, "/home/joseph/Documents/NLP_Grams/all-the-news/articles3.csv", 20)
del newsGram3
wineGram1 = run_project(WINE_COL_INDEX, "/home/joseph/Documents/NLP_Grams/wine-reviews/winemag-data-130k-v2.csv", 20)
del wineGram1
wineGram2 = run_project(WINE_COL_INDEX, "/home/joseph/Documents/NLP_Grams/wine-reviews/winemag-data_first150k.csv", 20)
del wineGram2