-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathsearch_public_opinions_by_keywords.py
58 lines (48 loc) · 2.13 KB
/
search_public_opinions_by_keywords.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
import sys, os
import scipy.sparse
import pandas as pd
from load_public_data import anes_opinion_data, anes_codebook
from find_public_opinion_question import find_anes_question
# Directory holding the saved TF-IDF artifacts for the public-opinion questions,
# relative to the current working directory.
# Built with os.path.join so the platform's own separator is used (the previous
# hard-coded ".\\data\\..." literal only resolved on Windows). The trailing ""
# component keeps the trailing separator, so on Windows the value is still
# exactly ".\\data\\tf-idf\\public_opinion\\".
PATH = os.path.join(".", "data", "tf-idf", "public_opinion", "")

# Load the sparse TF-IDF matrix (one row per opinion, one column per vocab term).
TFIDF_MATRIX_FILENAME = "tfidf_matrix.npz"
TFIDF_MATRIX_PATH = os.path.join(PATH, TFIDF_MATRIX_FILENAME)
tfidf_matrix = scipy.sparse.load_npz(TFIDF_MATRIX_PATH)

# Load the {row index : opinion id} mapping; the ids are read from the CSV
# column named '0', so opin_id[i] is the id for matrix row i.
OPINION_ID_FILENAME = "tfidf_rows.csv"
OPINION_ID_PATH = os.path.join(PATH, OPINION_ID_FILENAME)
opin_id_df = pd.read_csv(OPINION_ID_PATH)
opin_id = list(opin_id_df['0'])

# Load the {col index : vocab} mapping; the terms are read from the CSV
# column named '0', so vocab[j] is the term for matrix column j.
VOCAB_FILENAME = "tfidf_cols.csv"
VOCAB_PATH = os.path.join(PATH, VOCAB_FILENAME)
vocab_df = pd.read_csv(VOCAB_PATH)
vocab = list(vocab_df['0'])
#find list of relevant questions given set of keywords
#returns list of opinion_ids, best match first
def relevant_questions_by_vcf_code(keywords, top_n=10):
    """Rank questions by the summed TF-IDF weight of the given keywords.

    Args:
        keywords: iterable of vocabulary terms to search for. Terms that are
            not in ``vocab`` are ignored; a term given twice is counted twice.
        top_n: maximum number of ids to return. Defaults to 10, the limit
            that used to be hard-coded. ``None`` returns every match.

    Returns:
        List of entries from ``opin_id`` ordered by descending score. Rows
        with a score of 0 are never returned, so the list can be shorter
        than ``top_n`` and is empty when no keyword is in the vocabulary.
    """
    keyword_ind = [vocab.index(keyword) for keyword in keywords if keyword in vocab]
    if not keyword_ind:
        # No known keyword means every row would score 0.
        return []

    # Slice the keyword columns out of the sparse matrix once instead of doing
    # one sparse scalar lookup per (row, keyword) pair. The dense result has
    # only len(keyword_ind) columns, so it stays small.
    keyword_cols = tfidf_matrix[:, keyword_ind].toarray()

    # Score each opinion (row) by summing its tf-idf values in keyword order,
    # keeping only rows where at least one keyword contributes.
    scores = [
        (row, score)
        for row, score in enumerate(map(sum, keyword_cols))
        if score != 0
    ]

    # Sort by score, descending. The sort is stable, so rows with equal
    # scores stay in ascending row order.
    scores.sort(key=lambda tup: tup[1], reverse=True)

    # Translate the best-scoring row indices into opinion ids.
    return [opin_id[row] for row, _ in scores[:top_n]]
#given keywords, look up relevant questions by searching question text
#and return ANES codebook sub-dataframe
def relevant_questions_anes_df(keywords):
    """Return the ANES codebook entries for the questions matching *keywords*.

    Args:
        keywords: terms passed on to ``relevant_questions_by_vcf_code``.

    Returns:
        The ``pd.concat`` of the ``find_anes_question`` result for every
        matching code, in ranking order. An empty ``DataFrame`` when no
        question matches.
    """
    vcf_codes = relevant_questions_by_vcf_code(keywords)  # ranked opinion ids
    if not vcf_codes:
        # pd.concat raises ValueError on an empty list; "nothing matched" is
        # an ordinary outcome, so report it as an empty frame instead.
        return pd.DataFrame()

    # Load the codebook once and reuse it for every lookup.
    # NOTE(review): presumably a DataFrame of the ANES codebook - confirm
    # against load_public_data.anes_codebook.
    anes_codebook_df = anes_codebook()
    rel_questions = [
        find_anes_question(vcf_code, anes_codebook_df) for vcf_code in vcf_codes
    ]
    return pd.concat(rel_questions)  # concatenate into single df and return
if __name__ == "__main__":
    # Command-line use: every argument after the script name is a keyword.
    # With no keywords there is nothing to search for, so nothing is printed.
    cli_args = sys.argv[1:]
    if cli_args:
        keywords = cli_args
        print(relevant_questions_anes_df(keywords))