forked from fernandezpablo85/sysarmy-salaries-data-2016
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsysarmy.r
104 lines (87 loc) · 3.54 KB
/
sysarmy.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
library(ggplot2)
# Load the raw survey export. stringsAsFactors = TRUE is explicit because
# cleanup() recodes Region and Age through levels(), which only works on
# factors; since R 4.0 read.csv() returns character columns by default.
df <- read.csv('sueldos.sysarmy.csv', stringsAsFactors = TRUE)
# Rename a single column of a data frame.
#
# df:    data frame whose column should be renamed.
# old:   current column name to look for.
# wants: replacement name.
#
# Returns df with every column called `old` renamed to `wants`; when no
# column matches, the names are left as they were.
rename <- function(df, old, wants) {
  columns <- names(df)
  columns[columns == old] <- wants
  names(df) <- columns
  df
}
# Drop income outliers using Tukey's fences.
#
# data: data frame with a numeric `Income` column.
#
# Returns the rows of `data` whose Income lies strictly between
# Q1 - 1.5 * IQR and Q3 + 1.5 * IQR. Rows with a missing Income are dropped.
# Stops with an error when `data` has no Income column.
tukey <- function(data) {
  income <- data$Income
  if (is.null(income)) {
    stop("data has no 'Income' column", call. = FALSE)
  }
  # First and third quartile, computed once; their difference is the IQR.
  quartiles <- quantile(income, probs = c(0.25, 0.75), na.rm = TRUE,
                        names = FALSE)
  spread <- quartiles[2] - quartiles[1]
  low <- quartiles[1] - (spread * 1.5)
  high <- quartiles[2] + (spread * 1.5)
  # Index with `[` rather than subset(): subset() evaluates its condition
  # inside the data frame, so a column named `low` or `high` would shadow
  # the fences computed here.
  keep <- !is.na(income) & income > low & income < high
  data[keep, , drop = FALSE]
}
# Normalise the raw survey data frame into the columns used for analysis.
#
# data:           raw survey data as read from the CSV, with the Spanish
#                 column names produced by read.csv().
# handleOutliers: function taking and returning a data frame; it is applied
#                 after the fixed income bounds (e.g. `identity` or `tukey`).
#
# Returns a data frame with the columns Age, Region, YearsExperience,
# YearsCurrentJob, JobDescription, JobType, Happiness, Income, Gender and
# SwitchedJobsLast6Months.
cleanup <- function(data, handleOutliers) {
  # clean gender: "Hombre" becomes "M", any other non-missing answer "F".
  data$Gender <- ifelse(data$Soy == "Hombre", "M", "F")
  data$Gender <- as.factor(data$Gender)

  # rename columns.
  data <- rename(data, "Tengo", "Age")
  data <- rename(data, "Argentina", "Region")
  data <- rename(data, "A..os.de.experiencia", "YearsExperience")
  data <- rename(data, "A..os.en.el.puesto.actual", "YearsCurrentJob")
  data <- rename(data, "Trabajo.de", "JobDescription")
  data <- rename(data, "Tipo.de.contrato", "JobType")
  data <- rename(data, "Qu...tan.conforme.est..s.con.tu.sueldo.", "Happiness")
  data <- rename(data, "Cambiaste.de.empresa.en.los...ltimos.6.meses.",
                 "SwitchedJobsLast6Months")

  # The recoding below edits factor levels. Coerce first so it also works
  # when the CSV was read with character columns (the read.csv() default
  # since R 4.0), where levels() is NULL and the recoding would do nothing.
  data$Region <- as.factor(data$Region)
  data$Age <- as.factor(data$Age)

  # fix region names.
  levels(data$Region)[levels(data$Region) == "Entre R\303\255os"] <- "Entre Rios"
  levels(data$Region)[levels(data$Region) == "Ciudad Aut\303\263noma de Buenos Aires"] <- "CABA"
  levels(data$Region)[levels(data$Region) == "C\303\263rdoba"] <- "Cordoba"
  levels(data$Region)[levels(data$Region) == "Neuqu\303\251n"] <- "Neuquen"
  levels(data$Region)[levels(data$Region) == "R\303\255o Negro"] <- "Rio Negro"
  levels(data$Region)[levels(data$Region) == "Tucum\303\241n"] <- "Tucuman"
  levels(data$Region)[levels(data$Region) == "Provincia de Buenos Aires"] <- "GBA"

  # fix age.
  levels(data$Age)[levels(data$Age) == "Menos de 18 a\303\261os"] <- "18-"

  # fix salary: answers not reported as "Bruto" are divided by 0.70 so that
  # all incomes are on the same (gross) basis.
  data <- rename(data, "Salario.mensual..en.tu.moneda.local.", "Income")
  data$Income <- ifelse(data$Bruto.o.neto. == "Bruto",
                        data$Income,
                        data$Income / 0.70)
  data$Bruto.o.neto. <- NULL

  # fix job switch: "No" becomes 0, any other non-missing answer becomes 1.
  data$SwitchedJobsLast6Months <- ifelse(data$SwitchedJobsLast6Months == "No",
                                         0, 1)

  # remove fictitious data: keep incomes strictly between 1000 and 200000.
  data <- subset(data, Income < 200000)
  data <- subset(data, Income > 1000)

  # handle outliers.
  data <- handleOutliers(data)

  keep <- c("Age", "Region", "YearsExperience", "YearsCurrentJob",
            "JobDescription", "JobType", "Happiness", "Income", "Gender",
            "SwitchedJobsLast6Months")
  data[keep]
}
# Histogram of all incomes in 1000-wide bins.
#
# df: data frame with a numeric `Income` column.
#
# Returns a ggplot object with a blank y-axis label.
all.salaries.hist <- function(df) {
  # ylab() has to be added as its own plot component; passing ylab = "" as
  # an argument to ggplot() has no effect on the axis label.
  ggplot(df, aes(x = Income)) +
    geom_histogram(binwidth = 1000, fill = "#3399FF", alpha = 0.9) +
    ylab("")
}
# Income histogram with a dashed red vertical line at the median income.
#
# df: data frame with a numeric `Income` column.
#
# Returns the ggplot object from all.salaries.hist() plus the median line.
all.salaries.hist.median <- function(df) {
  # The function is named for the median, so mark the median (the previous
  # code drew the mean). The value is a constant, so it is passed as a fixed
  # xintercept instead of being mapped through aes(), which would draw one
  # identical line per data row.
  all.salaries.hist(df) +
    geom_vline(xintercept = median(df$Income, na.rm = TRUE),
               linetype = "longdash", color = "red")
}
# Income histogram in 1000-wide bins, with bars filled by Gender.
#
# df: data frame with a numeric `Income` column and a `Gender` column.
#
# Returns a ggplot object with a blank y-axis label.
all.salaries.gender <- function(df) {
  # ylab() has to be added as its own plot component; passing ylab = "" as
  # an argument to ggplot() has no effect on the axis label.
  ggplot(df, aes(x = Income, fill = Gender)) +
    geom_histogram(binwidth = 1000, alpha = 0.9) +
    ylab("")
}
# Income histogram with bars coloured by Tukey outlier class.
#
# df: data frame with a numeric `Income` column.
#
# Each row is tagged "LowOutliers" (Income <= Q1 - 1.5 * IQR),
# "HighOutliers" (Income >= Q3 + 1.5 * IQR) or "Middle", and the tag is used
# as the fill. A dashed red line marks the upper fence.
#
# Returns a ggplot object.
color.outliers <- function(df) {
  # First and third quartile, computed once; their difference is the IQR.
  quartiles <- quantile(df$Income, probs = c(0.25, 0.75), names = FALSE)
  iqr <- quartiles[2] - quartiles[1]
  low <- quartiles[1] - (iqr * 1.5)
  high <- quartiles[2] + (iqr * 1.5)

  df$OutlierTag <- "Middle"
  df$OutlierTag[df$Income <= low] <- "LowOutliers"
  df$OutlierTag[df$Income >= high] <- "HighOutliers"

  # The fence is a single constant, so it is given as a fixed xintercept
  # rather than mapped through aes(), which would draw one line per row and
  # could pick up a data column named `high` instead of the local value.
  ggplot(df, aes(x = Income, fill = OutlierTag)) +
    geom_histogram(binwidth = 1000) +
    geom_vline(xintercept = high, linetype = "longdash", color = "red")
}
# Build the cleaned data set; passing `identity` as handleOutliers means no
# outlier filtering beyond the fixed income bounds inside cleanup().
clean <- cleanup(df, handleOutliers = identity)
# Save the cleaned data without a row-name column.
write.csv(clean, 'clean.csv', row.names=FALSE)
# Pick the plot this script renders; assign one of the other plot functions
# defined in this file to change it.
default.plot <- color.outliers
# NOTE(review): the plot appears only where top-level values are
# auto-printed; it presumably needs an explicit print() when this file is
# run through source() -- confirm.
default.plot(clean)