import duckplyr to merge household data

ipeaGIT · Jul 16, 2024 · 6b7c272 · 6b7c272
1 parent 0ad424a
commit 6b7c272
Show file tree

Hide file tree

Showing 8 changed files with 33 additions and 21 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -32,15 +32,14 @@ Imports:
     arrow (>= 15.0.1),
     checkmate,
     dplyr,
-    duckdb,
+    duckplyr,
     httr (>= 1.4.1),
     tools
 Suggests:
     covr,
     DBI,
     dbplyr,
     duckdb,
-    duckplyr,
     geobr,
     ggplot2 (>= 3.3.1),
     rmarkdown,

diff --git a/NEWS.md b/NEWS.md
@@ -1,9 +1,13 @@
 # censobr v0.3.29999 dev
 
-* Minor changes
+* Major changes
   * Some functions `read_population`, `read_mortality`, `read_families`, `read_emigration` now include a new parameter `merge_households` (logical) to indicate whether the function should merge household variables to the output data. Closes [#31](https://github.com/ipeaGIT/censobr/issues/31)
+  * {censobr} now imports the {duckplyr} package, which is used for merging household data (see issue #31).
   * New vignette showing how to work with larger-than-memory data. Closes [#42](https://github.com/ipeaGIT/censobr/issues/42)
 
+* Changes to data sets and files included in this version:
+  * Population microdata for the year 2000 now include a few columns that were not included before. Closes [#44](https://github.com/ipeaGIT/censobr/issues/44)
+  * Included additional columns and fixed a minor error in the data dictionary of the 2010 microdata. Closes [#45](https://github.com/ipeaGIT/censobr/issues/45)
 
 
 # censobr v0.3.2

diff --git a/R/merge_household.R b/R/merge_household.R
@@ -65,12 +65,11 @@ merge_household_var <- function(df,
   df <- arrow::to_duckdb(df)
   df_household <- arrow::to_duckdb(df_household)
 
-
   # merge
-  temp_df <- dplyr::left_join(df, df_household)
+  df_geo <- duckplyr::left_join(df, df_household)
 
   # back to arrow
-  temp_df <- arrow::to_arrow(temp_df)
+  df_geo <- arrow::to_arrow(df_geo)
 
-  return(temp_df)
+  return(df_geo)
 }
diff --git a/R/read_mortality.R b/R/read_mortality.R
@@ -72,9 +72,9 @@ read_mortality <- function(year = 2010,
   ### merge household data
   if (isTRUE(merge_households)) {
     df <- merge_household_var(df,
-                            year = year,
-                            add_labels = add_labels,
-                            showProgress)
+                              year = year,
+                              add_labels = add_labels,
+                              showProgress)
     }
 
   ### Select

diff --git a/tests/tests_rafa/merge_household_notes.R b/tests/tests_rafa/merge_household_notes.R
@@ -91,8 +91,16 @@ merge_household_var <- function(df, year, add_labels=NULL, showProgress=T){
   vars_to_drop <- setdiff(all_common_vars, key_vars)
   df_household <- dplyr::select(df_household, -all_of(vars_to_drop))
 
+  # convert to duckdb
+  df <- arrow::to_duckdb(df)
+  df_household <- arrow::to_duckdb(df_household)
+
   # merge
-  temp_df <- dplyr::left_join(df, df_household)
+  gf_geo <- duckplyr::left_join(df, df_household)
+
+  # back to arrow
+  gf_geo <- arrow::to_arrow(gf_geo)
+
 
-  return(temp_df)
+  return(gf_geo)
   }
diff --git a/tests/testthat/test_docs_interview_manual.R b/tests/testthat/test_docs_interview_manual.R
@@ -10,12 +10,12 @@ testthat::skip_on_cran()
 test_that("interview_manual", {
 
   # download files
-  testthat::expect_message( interview_manual(year = 2022) )
-  testthat::expect_message( interview_manual(year = 2010) )
-  testthat::expect_message( interview_manual(year = 2000) )
-  testthat::expect_message( interview_manual(year = 1991) )
-  testthat::expect_message( interview_manual(year = 1980) )
-  testthat::expect_message( interview_manual(year = 1970) )
+  testthat::expect_message( interview_manual(year = 2022, showProgress = FALSE) )
+  testthat::expect_message( interview_manual(year = 2010, showProgress = FALSE) )
+  testthat::expect_message( interview_manual(year = 2000, showProgress = FALSE) )
+  testthat::expect_message( interview_manual(year = 1991, showProgress = FALSE) )
+  testthat::expect_message( interview_manual(year = 1980, showProgress = FALSE) )
+  testthat::expect_message( interview_manual(year = 1970, showProgress = FALSE) )
 
   # cache dir
   pkgv <- paste0('censobr/data_release_', censobr_env$data_release)

diff --git a/tests/testthat/test_labels_households.R b/tests/testthat/test_labels_households.R
@@ -33,7 +33,7 @@ test_that("add_labels_households", {
 
   ################################################################### 2000
   # sem labels
-  test2a <- read_households(year = 2000, add_labels = NULL) |>
+  test2a <- read_households(year = 2000, add_labels = NULL, showProgress = FALSE) |>
     filter(abbrev_state == 'RO')
 
   # com labels

diff --git a/vignettes/censobr.Rmd b/vignettes/censobr.Rmd
@@ -201,13 +201,15 @@ In this final example, we're going to visualize how the amount of money people s
 
 First, let's download the municipalities of the metro area of São Paulo.
 ```{r warning = FALSE}
-metro_muni <- geobr::read_metro_area(year = 2010, showProgress = FALSE) |> 
+metro_muni <- geobr::read_metro_area(year = 2010, 
+                                     showProgress = FALSE) |> 
               subset(name_metro == "RM São Paulo")
 ```
 We also need the polygons of the weighting areas (áreas de ponderação). With the code below, we download all weighting areas in the state of São Paulo, and then keep only the ones in the metropolitan region of São Paulo.
 
 ```{r warning = FALSE}
-wt_areas <- geobr::read_weighting_area(code_weighting = "SP", showProgress = FALSE,
+wt_areas <- geobr::read_weighting_area(code_weighting = "SP", 
+                                       showProgress = FALSE,
                                        year = 2010)
 
 wt_areas <- subset(wt_areas, code_muni %in% metro_muni$code_muni)