From 940c83804fb7606365e95c3ca8c9f876793ce9ad Mon Sep 17 00:00:00 2001
From: rafapereirabr <rafa.pereira.br@gmail.com>
Date: Tue, 16 Jul 2024 11:46:14 -0300
Subject: [PATCH] update microdata_sample_2000 households

---
 data_prep/R/add_geography_cols.R                 |  2 +-
 data_prep/R/microdata_sample_2000.R              | 16 ++++++++--------
 .../read_guides/readguide_2000_households.csv    |  2 +-
 3 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/data_prep/R/add_geography_cols.R b/data_prep/R/add_geography_cols.R
index c6b353c..6ba7f55 100644
--- a/data_prep/R/add_geography_cols.R
+++ b/data_prep/R/add_geography_cols.R
@@ -14,7 +14,7 @@ add_geography_cols <- function(arrw, year){
                    year == 1991 ~ 'code_muni',
                    year == 2010 ~ 'V0002',
                    year == 2022 ~ 'CD_MUN',
-                   year == 2000 ~ 'V1103')
+                   year == 2000 ~ 'V0103')
 
   if(year %in% c(2000, 2010)){
     arrw <- mutate(arrw,
diff --git a/data_prep/R/microdata_sample_2000.R b/data_prep/R/microdata_sample_2000.R
index 148972f..38cc3e8 100644
--- a/data_prep/R/microdata_sample_2000.R
+++ b/data_prep/R/microdata_sample_2000.R
@@ -33,7 +33,7 @@ lapply(X=txt_files,
 
   # list all files
   data_dir <- 'R:/Dropbox/bases_de_dados/censo_demografico/censo_2000/microdados_txt'
-  txt_files <- list.files(data_dir,
+  txt_files <- list.files(path = data_dir,
                           pattern = 'Dom[[:digit:]]|DOM',
                           recursive = TRUE,
                           full.names = TRUE)
@@ -49,23 +49,23 @@ lapply(X=txt_files,
 
   # Define the dataset
   DS <- arrow::open_dataset(sources = parqt_files)
-  # Create a scanner
-  SO <- Scanner$create(DS)
-  # Load it as n Arrow Table in memory
-  AT <- SO$ToTable()
-  rm(DS, SO); gc(T)
+  # # Create a scanner
+  # SO <- Scanner$create(DS)
+  # # Load it as an Arrow Table in memory
+  # AT <- SO$ToTable()
+  # rm(DS, SO); gc(T)
 
   ## 3.3) add geography variables ----------------------------------------------
 
   # # drop row if all columns are NA
-  AT <- filter(AT, !is.na(PESO_DOMIC))
+  AT <- filter(DS, !is.na(P001))
 
   AT <- add_geography_cols(arrw = AT, year = 2000)
 
   head(AT) |> collect()
 
   ## 3.4) save single parquet tile ----------------------------------------------
-  arrow::write_parquet(AT, './data/microdata_sample/2000/2000_households.parquet')
+  arrow::write_parquet(AT, './data/microdata_sample/2000/2000_households2.parquet')
 
 
 
diff --git a/data_prep/read_guides/readguide_2000_households.csv b/data_prep/read_guides/readguide_2000_households.csv
index e693427..dd0ee07 100644
--- a/data_prep/read_guides/readguide_2000_households.csv
+++ b/data_prep/read_guides/readguide_2000_households.csv
@@ -75,7 +75,7 @@ int_pos;var_name;x;label;length;decimal_places;fin_pos;col_type;CHAR
 143;V7409;2.;;2;0;144;i;FALSE
 145;V7616;6.;;6;0;150;i;FALSE
 151;V7617;6.2;;6;2;156;d;FALSE
-157;PESO_DOMIC;11.8;;11;8;167;d;FALSE
+157;P001;11.8;;11;8;167;d;FALSE
 168;v1111;$1.;;1;0;168;c;TRUE
 169;v1112;$1.;;1;0;169;c;TRUE
 170;v1113;$1.;;1;0;170;c;TRUE