From 940c83804fb7606365e95c3ca8c9f876793ce9ad Mon Sep 17 00:00:00 2001 From: rafapereirabr Date: Tue, 16 Jul 2024 11:46:14 -0300 Subject: [PATCH] update microdata_sample_2000 households --- data_prep/R/add_geography_cols.R | 2 +- data_prep/R/microdata_sample_2000.R | 16 ++++++++-------- .../read_guides/readguide_2000_households.csv | 2 +- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/data_prep/R/add_geography_cols.R b/data_prep/R/add_geography_cols.R index c6b353c..6ba7f55 100644 --- a/data_prep/R/add_geography_cols.R +++ b/data_prep/R/add_geography_cols.R @@ -14,7 +14,7 @@ add_geography_cols <- function(arrw, year){ year == 1991 ~ 'code_muni', year == 2010 ~ 'V0002', year == 2022 ~ 'CD_MUN', - year == 2000 ~ 'V1103') + year == 2000 ~ 'V0103') if(year %in% c(2000, 2010)){ arrw <- mutate(arrw, diff --git a/data_prep/R/microdata_sample_2000.R b/data_prep/R/microdata_sample_2000.R index 148972f..38cc3e8 100644 --- a/data_prep/R/microdata_sample_2000.R +++ b/data_prep/R/microdata_sample_2000.R @@ -33,7 +33,7 @@ lapply(X=txt_files, # list all files data_dir <- 'R:/Dropbox/bases_de_dados/censo_demografico/censo_2000/microdados_txt' - txt_files <- list.files(data_dir, + txt_files <- list.files(path = data_dir, pattern = 'Dom[[:digit:]]|DOM', recursive = TRUE, full.names = TRUE) @@ -49,23 +49,23 @@ lapply(X=txt_files, # Define the dataset DS <- arrow::open_dataset(sources = parqt_files) - # Create a scanner - SO <- Scanner$create(DS) - # Load it as n Arrow Table in memory - AT <- SO$ToTable() - rm(DS, SO); gc(T) + # # Create a scanner + # SO <- Scanner$create(DS) + # # Load it as n Arrow Table in memory + # AT <- SO$ToTable() + # rm(DS, SO); gc(T) ## 3.3) add geography variables ---------------------------------------------- # # drop row if all columns are NA - AT <- filter(AT, !is.na(PESO_DOMIC)) + AT <- filter(DS, !is.na(P001)) AT <- add_geography_cols(arrw = AT, year = 2000) head(AT) |> collect() ## 3.4) save single parquet tile ---------------------------------------------- - arrow::write_parquet(AT, './data/microdata_sample/2000/2000_households.parquet') + arrow::write_parquet(AT, './data/microdata_sample/2000/2000_households2.parquet') diff --git a/data_prep/read_guides/readguide_2000_households.csv b/data_prep/read_guides/readguide_2000_households.csv index e693427..dd0ee07 100644 --- a/data_prep/read_guides/readguide_2000_households.csv +++ b/data_prep/read_guides/readguide_2000_households.csv @@ -75,7 +75,7 @@ int_pos;var_name;x;label;length;decimal_places;fin_pos;col_type;CHAR 143;V7409;2.;;2;0;144;i;FALSE 145;V7616;6.;;6;0;150;i;FALSE 151;V7617;6.2;;6;2;156;d;FALSE -157;PESO_DOMIC;11.8;;11;8;167;d;FALSE +157;P001;11.8;;11;8;167;d;FALSE 168;v1111;$1.;;1;0;168;c;TRUE 169;v1112;$1.;;1;0;169;c;TRUE 170;v1113;$1.;;1;0;170;c;TRUE