-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathMuddyUS_extract_pdf.py
45 lines (32 loc) · 1.4 KB
/
MuddyUS_extract_pdf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
#importing/reshaping CA county data from 2020 US Pres election
import tabula
import pandas as pd
file1="https://www.presidency.ucsb.edu/sites/default/files/election_data/ca-2020.pdf"
table = tabula.read_pdf(file1, pages="22-27")
table[0]
table1 = table
tab1 = pd.concat([table1[i] for i in range(3)], ignore_index=True)
tab2 = pd.concat([table1[i+3] for i in range(3)], ignore_index=True)
tab3 = pd.merge(tab1, tab2, on='Unnamed: 0')
tab3 = tab3.drop_duplicates()
tab3 = tab3.rename(columns={'Unnamed: 0': 'County'})
tab3 = tab3.query('`County` != "Percent"')
tab3 = tab3.query('`County` != "State Totals"')
party = tab3.iloc[0]
party = party[1:]
party = party.rename_axis('Candidate')
party = pd.DataFrame({'Party': party})
party = party.assign(Candidate=party.index)
party.index = [i for i in range(11)]
party = party[['Candidate', 'Party']]
tab3 = tab3.loc[1:]
tab3.index = [i for i in range(58)]
tab3.loc[38] = tab3.loc[38].fillna(value=0)
tab4 = tab3.melt(id_vars=['County'], var_name='Candidate', value_name='Votes')
tab5 = pd.merge(tab4, party)
tab5 = tab5[['County', 'Candidate', 'Party', 'Votes']]
tab5 = tab5.astype('string')
tab5['Votes'] = tab5['Votes'].str.replace(',', '').astype(int)
tab5 = tab5.sort_values(by=['County', 'Votes'], ascending = [True, False])
tab6 = tab5.groupby('County')['Votes'].sum().sort_values(ascending=False)
tab5.to_csv('~/home/nathan/Documents/MuddyCA2020.csv', index=False)