forked from vishakha-lall/MapBot
-
Notifications
You must be signed in to change notification settings - Fork 0
/
featuresDump.py
90 lines (74 loc) · 2.03 KB
/
featuresDump.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
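# Use the features.py module to dump out features:
# read in a CSV of sentences and bulk-dump their features to an output CSV
# (default ./analysis/featuresDump.csv; the input defaults to ./analysis/sentences.csv).
# Input CSV fmt: first row is a header (skipped); 1st field is the text to process,
# 2nd field is the class. The sentence ID is generated from an MD5 hash of the text.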
import csv
import sys
import hashlib
from pathlib import Path
import features # features.py is bepoke util to extract NLTK POS features from sentences
import logging
log = logging.getLogger(__name__)
log.info("Entered module: %s", __name__)

# Input CSV: first command-line argument if given, otherwise the default file.
if len(sys.argv) > 1:
    FNAME = Path(sys.argv[1])
else:
    FNAME = Path("./analysis/sentences.csv")
# The "%s" placeholder is required: logging formats with `msg % args`, so
# passing FNAME without a placeholder produced a formatting error instead of
# the path whenever DEBUG output was enabled.
logging.debug("reading input from %s", FNAME)

# Output CSV: second command-line argument if given, otherwise the default file.
if len(sys.argv) > 2:
    FOUT = Path(sys.argv[2])
else:
    FOUT = Path("./analysis/featuresDump.csv")
logging.debug("Writing output to %s", FOUT)
# Columns written to the output file, in order.  Each name is looked up in the
# mapping returned by features.features_dict() for every input row.
keys = [
    "id",
    "wordCount",
    "stemmedCount",
    "stemmedEndNN",
    "CD",
    "NN",
    "NNP",
    "NNPS",
    "NNS",
    "PRP",
    "VBG",
    "VBZ",
    "startTuple0",
    "endTuple0",
    "endTuple1",
    "endTuple2",
    "verbBeforeNoun",
    "qMark",
    "qVerbCombo",
    "qTripleScore",
    "sTripleScore",
    "class",
]

# The header row depends only on `keys`, so build it once instead of per row.
# The leading space is deliberate: it keeps the output byte-for-byte identical
# to the historical " id, wordCount, ..." layout (the old code stripped only
# the comma from a ", "-prefixed string).
header = " " + ", ".join(keys)

loopCount = 0  # number of data rows written so far

# The with-statement closes both files even if a row raises part-way through.
# newline="" is what the csv module documents for file objects handed to it.
with open(FNAME, "rt", newline="") as fin, open(FOUT, "wt", newline="") as fout:
    reader = csv.reader(fin)
    next(reader, None)  # Assume we have a header; tolerate a totally empty file

    for line in reader:
        if not line:
            # csv.reader yields [] for a blank line; there is nothing to process
            continue

        sentence = line[0]
        c = line[1]  # class-label

        # Derive a stable 16-hex-character ID from the sentence text.  MD5 is
        # used here only as a fingerprint, not for anything security-related.
        sentence_id = hashlib.md5(str(sentence).encode("utf-8")).hexdigest()[:16]

        # features_dict() must return a mapping containing every name in `keys`
        # (a missing name raises KeyError below).
        f = features.features_dict(sentence_id, sentence, c)

        # Build the row before touching the file so that a failed lookup on
        # the first row does not leave a dangling header behind.
        output = " " + ", ".join(str(f[key]) for key in keys)

        if loopCount == 0:  # only print the header ahead of the first data row
            logging.debug(header)
            fout.write(header + "\n")

        loopCount += 1
        logging.debug(output)
        fout.write(output + "\n")