-
Notifications
You must be signed in to change notification settings - Fork 10
/
Copy pathmanually_verify_links.py
251 lines (226 loc) · 8.87 KB
/
manually_verify_links.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
"""
This script opens candidate URLs for municipalities websites in the
web browser and asks the user to check whether or not they seem to be the
official city hall or city council portals.
Usage:
python manually_verify_links.py
For instructions use:
python manually_verify_links.py --help
Este script abre no navegador as URLs candidatas a sites dos municípios e
pede ao utilizador que verifique se elas parecem ser os portais das
prefeituras e câmaras municipais.
"""
import os
import argparse
from datetime import datetime
import logging
import random
import webbrowser
import pandas as pd
from validation.verify_links import (healthy_link, get_title_and_type,
get_candidate_links, get_output_to_be_merged, store_csv)
INPUT_FOLDER = '../../data/unverified'
INPUT_FILE = 'municipality-website-candidate-links.csv'
OUTPUT_FOLDER = '../../data/valid'
MAX_QUANTITY = 0
def parse_cli() -> dict:
    """Parses the command line interface.

    Returns:
        dict: A dict containing the values for input_folder, input_file,
            data_package_path and max_quantity.

    Raises:
        FileNotFoundError: If the given output folder does not exist, or
            if the output folder (given or default) has no
            datapackage.json file.
    """
    parser = argparse.ArgumentParser(
        description='''Opens in a web browser candidate URLs for municipalities
websites and asks the user to checks if they seem to be the city hall or
city council portals.

Abre no navegador as URLs candidatas a sites dos municípios e pede ao
utilizador que verifique se elas parecem ser os portais das prefeituras e
câmaras municipais.
'''
    )
    parser.add_argument('input',
        help='input file in CSV format / arquivo em formato CSV',
        default='',
        nargs='?',
    )
    parser.add_argument('output',
        help=('output folder for the CSV '
              '(must have a datapackage.json with a schema) / '
              'pasta de saída para o CSV '
              '(precisa ter um datapackage.json com um esquema)'),
        default='',
        nargs='?',
    )
    parser.add_argument('-q', '--quantity',
        metavar='int', type=int,
        help='maximum quantity of cities to process / quantidade máxima a processar',
        default=0,
    )
    params = {}
    args = parser.parse_args()
    if args.input:
        # split a path like "folder/file.csv" into its two components
        params['input_folder'] = os.path.dirname(args.input)
        params['input_file'] = os.path.basename(args.input)
    else:  # use default values
        params['input_folder'] = INPUT_FOLDER
        params['input_file'] = INPUT_FILE
    if args.output:
        if not os.path.exists(args.output):
            raise FileNotFoundError(f'Folder not found: {args.output}')
        output_folder = args.output
    else:  # use default value
        output_folder = OUTPUT_FOLDER
    params['data_package_path'] = os.path.join(output_folder, 'datapackage.json')
    if not os.path.exists(params['data_package_path']):
        # Report output_folder (not args.output), which is also correct
        # when the default folder is in use and args.output is ''.
        raise FileNotFoundError(
            f'datapackage.json not found in folder: {output_folder}')
    if args.quantity:
        params['max_quantity'] = args.quantity
    else:  # use default value
        params['max_quantity'] = MAX_QUANTITY
    return params
def choose(text: str, options: str) -> str:
    """Asks the user to choose one of the defined options.

    Prompts repeatedly until a valid option is entered.

    Args:
        text (str): The text prompt to be presented to the user.
        options (str): A string containing one character for each option
            the user can choose.

    Returns:
        str: A single lowercase character representing what the user
            has chosen.
    """
    valid = options.lower()
    while True:
        key = input(text).lower()
        # Reject empty input: '' is a substring of any string, so a bare
        # Enter would otherwise be accepted as a valid "choice".
        if key and key in valid:
            break
    return key
def verify_city_links(candidates, code):
    """Verify links for a city with a given code.

    For each unique candidate link of the city, checks that the link is
    reachable, guesses its type from the page title, opens it in the web
    browser and asks the user to classify it interactively.

    Args:
        candidates: DataFrame of candidate links; the columns code,
            name, uf and link are read here.
        code: Municipality code used to select this city's rows.

    Returns:
        tuple: (signal, verified_links) where signal is 'q' when the
            user chose to quit (None otherwise) and verified_links is a
            list of dicts describing the links confirmed by the user.
    """
    verified_links = []
    signal = None  # set to 'q' when the user asks to stop verifying
    city_links = candidates[candidates.code == code]
    # City name and state abbreviation are the same for all rows of the city.
    name = city_links.name.iloc[0]
    uf = city_links.uf.iloc[0]
    print(f'Verifying candidate links for {name}, {uf}...')
    for link in city_links.link.unique():
        print(f'\n Checking link "{link}"...')
        working_link = healthy_link(link)
        if working_link:
            print(f' Returned status code {working_link.status_code}')
            # Guess the site type before asking, so the guess can serve
            # as the default classification if the user picks none.
            title, link_type = get_title_and_type(
                working_link,
                candidates[candidates.link==link]
            )
            print(f' Title is: {title}.')
            print(f' Most likely site type is: {link_type}')
            if link_type == 'prefeitura':
                branch = 'executive'
            elif link_type == 'camara':
                branch = 'legislative'
            else:
                branch = None
            print(' Opening URL in browser...')
            webbrowser.open(link)
            choice = choose('''
What link type does this seem to be?
[P] Prefeitura (city hall)
[C] Câmara (city council)
[T] City hall transparency portal
[Y] City council transparency portal
[N] None of the above
[S] Skip
[Q] Quit
''', 'pctynsq')
            # TODO: implement deletion if link type is none or broken
            if choice in ['n','s']: # none or skip
                continue
            if choice == 'q': # quit
                signal = 'q'
                break
            # The user's explicit answer overrides the guessed branch.
            if choice == 'p': # prefeitura
                branch = 'executive'
            elif choice == 'c': # camara
                branch = 'legislative'
            # TODO: implement recording transparency portals
            if branch is None:
                # unable to determine branch and user did not select one,
                # skip to next one in loop
                print(' None of the above, ignoring link.')
                continue
            if branch == 'executive':
                print(' Setting link as Prefeitura.')
            elif branch == 'legislative':
                print(' Setting link as Câmara.')
            verified_link = {
                'state_code': uf,
                'municipality_code': code,
                'municipality': name,
                'sphere': 'municipal',
                'branch': branch,
                'url': working_link.url, # update if redirected
                # NOTE(review): datetime.utcnow() returns a naive datetime
                # and is deprecated since Python 3.12; consider
                # datetime.now(timezone.utc) — confirm downstream CSV
                # format before changing.
                'last-verified-manual': datetime.utcnow()
            }
            verified_links.append(verified_link)
        else:
            print(' Error opening URL.')
    return signal, verified_links
def manual_verify(input_folder: str, input_file: str, data_package_path: str,
    max_quantity: int) -> pd.DataFrame:
    """Manually verifies links by opening each one of them on the browser
    for the user to check. Then asks the user to classify the link type.

    A Python function that does the same job as the script that is run
    from the command line.

    Args:
        input_folder (str): The folder containing the input table.
        input_file (str): Name of the file containing the input table in
            csv format.
        data_package_path (str): Path to the datapackage.json file.
        max_quantity (int): Maximum quantity of links to check.

    Returns:
        pd.DataFrame: Pandas dataframe containing the verified links.
    """
    candidates = get_candidate_links(
        file_path=os.path.join(input_folder, input_file),
        max_quantity=max_quantity)
    codes = candidates.code.unique()
    # Shuffle so repeated partial runs cover different cities.
    random.shuffle(codes)
    results = []
    # NOTE(review): when max_quantity is 0 this message prints "0 cities"
    # even though all candidates are processed — confirm against
    # get_candidate_links' handling of max_quantity.
    print(f'Verifying candidate URLs for {max_quantity} cities...')
    for code in codes:
        signal, links_to_add = verify_city_links(candidates[candidates.code == code], code)
        if signal == 'q':
            print('Quitting...')
            break
        results.extend(links_to_add)
    # read resource to be updated
    table = get_output_to_be_merged(data_package_path)
    print('Updating values...')
    for result in results:
        # get existing data in file to be updated
        existing_data = table.loc[
            (table.municipality_code == result['municipality_code']) &
            (table.branch == result['branch'])
        ]
        if len(existing_data) > 0:
            # Update the first matching row in place, overwriting only
            # the freshly verified fields.
            index = existing_data.index[0]
            row = existing_data.iloc[0].copy()
            for key in ['sphere', 'branch', 'url', 'last-verified-manual']:
                row[key] = result[key] # update only the new values
            print(f'Updating {index} with {row}...')
            table.loc[index] = row
        else:
            # add row to the end
            print(f'Adding {result} to the end...')
            table.loc[len(table)] = result
    # remove duplicate entries,
    # take into account only url column,
    # keep last entry to preserve the last-verified-auto timestamp
    table.drop_duplicates(subset='url', keep='last', inplace=True)
    table.sort_values(by=['state_code', 'municipality'], inplace=True)
    return table
if __name__ == '__main__':
    # Script entry point: parse CLI arguments, run the interactive
    # verification session, then persist the merged table as CSV.
    logging.getLogger().setLevel(logging.INFO)
    cli_options = parse_cli()
    verified_table = manual_verify(**cli_options)
    print(f'Recording {cli_options["data_package_path"]}...')
    store_csv(verified_table, cli_options['data_package_path'])