International Survey Compatibility
Source:vignettes/international-surveys.Rmd
international-surveys.RmdIntroduction
metasurvey’s Survey class and step pipeline
(step_compute, step_recode,
step_rename, step_remove,
step_join) are survey-agnostic: they work with any tabular
data. This vignette demonstrates compatibility with 7 household surveys
from 6 countries, using real bundled data where possible.
For each survey we show the full flow: load data, create a
Survey, apply steps, estimate with workflow(),
and package the pipeline as a Recipe.
Compatibility matrix
| Feature | ECH | EPH | CASEN | PNADc | CPS | ENIGH | DHS |
|---|---|---|---|---|---|---|---|
step_compute |
✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
step_recode |
✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
step_rename |
✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
step_remove |
✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
add_weight |
✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
strata |
N/A | N/A | ✓ | ✓ | N/A | ✓ | ✓ |
psu |
N/A | N/A | ✓ | ✓ | N/A | ✓ | ✓ |
add_replicate |
✓ | N/A | N/A | ✓ | ✓ | N/A | N/A |
workflow |
✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
Recipe |
✓ | ✓ | ✓ | ✓ | ✓ | ✓ | ✓ |
Installing companion packages
Most packages are on CRAN. The casen package is only
available from GitHub. Sections for unavailable packages are skipped
automatically. Install them manually to see all examples:
# CRAN packages
install.packages(c("eph", "PNADcIBGE", "ipumsr", "rdhs"))
# GitHub-only packages
# install.packages("remotes")
remotes::install_github("pachadotdev/casen")ECH – Uruguay
The ECH (Encuesta Continua de Hogares) is the primary survey metasurvey was built for. A bundled sample is included in the package.
library(metasurvey)
library(data.table)
dt_ech <- fread(
system.file("extdata", "ech_2023_sample.csv", package = "metasurvey")
)
svy_ech <- Survey$new(
data = dt_ech,
edition = "2023",
type = "ech",
psu = NULL,
engine = "data.table",
weight = add_weight(annual = "W_ANO")
)
svy_ech <- svy_ech |>
step_recode(labor_status,
POBPCOAC == 2 ~ "Employed",
POBPCOAC %in% 3:5 ~ "Unemployed",
POBPCOAC %in% c(6:10, 1) ~ "Inactive or under 14",
comment = "ILO labor force status"
) |>
step_compute(
income_pc = HT11 / nper,
comment = "Per capita household income"
) |>
bake_steps()
workflow(
list(svy_ech),
survey::svymean(~HT11, na.rm = TRUE),
estimation_type = "annual"
)
#> stat value se cv confint_lower
#> <char> <num> <num> <num> <num>
#> 1: survey::svymean: HT11 107869.1 3473.836 0.03220417 101060.5
#> confint_upper
#> <num>
#> 1: 114677.7For the complete ECH pipeline, see
vignette("ech-case-study").
EPH – Argentina
The EPH (Encuesta Permanente de Hogares) is Argentina’s quarterly
labor force survey. The eph package (CRAN, GitHub) includes a bundled
toybase.
library(eph)
data("toybase_individual_2016_04", package = "eph")
dt_eph <- data.table(toybase_individual_2016_04)
svy_eph <- Survey$new(
data = dt_eph,
edition = "201604",
type = "eph",
psu = NULL,
engine = "data.table",
weight = add_weight(quarterly = "PONDERA")
)
svy_eph <- svy_eph |>
step_recode(labor_status,
ESTADO == 1 ~ "Employed",
ESTADO == 2 ~ "Unemployed",
ESTADO == 3 ~ "Inactive",
.default = NA_character_,
comment = "Labor force status (INDEC)"
) |>
step_recode(sex,
CH04 == 1 ~ "Male",
CH04 == 2 ~ "Female",
.default = NA_character_,
comment = "Sex from CH04"
) |>
step_compute(
employed = ifelse(ESTADO == 1, 1L, 0L),
comment = "Employment indicator"
) |>
bake_steps()
# Employment rate
workflow(
list(svy_eph),
survey::svymean(~employed, na.rm = TRUE),
estimation_type = "quarterly"
)
#> stat value se cv confint_lower
#> <char> <num> <num> <num> <num>
#> 1: survey::svymean: employed 0.4378169 0.01668347 0.03810603 0.4051179
#> confint_upper
#> <num>
#> 1: 0.4705159To download actual EPH microdata use
eph::get_microdata(year = 2023, period = 3, type = "individual").
CASEN – Chile
The CASEN (Encuesta de Caracterizacion Socioeconomica Nacional) is
Chile’s main socioeconomic survey. It uses stratified cluster sampling.
The casen package (GitHub) includes a
bundled sample from the Los Rios region.
Install with
remotes::install_github("pachadotdev/casen").
# Requires: remotes::install_github("pachadotdev/casen")
data("casen_2017_los_rios")
dt_casen <- data.table(casen_2017_los_rios)
svy_casen <- Survey$new(
data = dt_casen,
edition = "2017",
type = "casen",
psu = "varunit",
strata = "varstrat",
engine = "data.table",
weight = add_weight(annual = "expc")
)
svy_casen <- svy_casen |>
step_recode(sex,
sexo == 1 ~ "Male",
sexo == 2 ~ "Female",
.default = NA_character_,
comment = "Sex (MDS codebook)"
) |>
step_recode(poverty_status,
pobreza == 1 ~ "Extreme poverty",
pobreza == 2 ~ "Non-extreme poverty",
pobreza == 3 ~ "Not poor",
.default = NA_character_,
comment = "Poverty status"
) |>
step_compute(
log_income = log(ytotcorh + 1),
comment = "Log household income"
) |>
bake_steps()
# Mean household income
workflow(
list(svy_casen),
survey::svymean(~ytotcorh, na.rm = TRUE),
estimation_type = "annual"
)For the full CASEN microdata, use
descargar_casen_github(2017, tempdir()) from the casen
package.
PNADc – Brazil
The PNADc (Pesquisa Nacional por Amostra de Domicilios Continua) is
Brazil’s quarterly labor force survey with stratified cluster sampling.
The PNADcIBGE package (CRAN) includes
bundled example microdata.
library(PNADcIBGE)
dt_pnadc <- data.table(read_pnadc(
microdata = system.file("extdata", "exampledata.txt", package = "PNADcIBGE"),
input_txt = system.file("extdata", "input_example.txt", package = "PNADcIBGE")
))
svy_pnadc <- Survey$new(
data = dt_pnadc,
edition = "202301",
type = "pnadc",
psu = "UPA",
strata = "Estrato",
engine = "data.table",
weight = add_weight(quarterly = "V1028")
)
svy_pnadc <- svy_pnadc |>
step_recode(sex,
V2007 == 1 ~ "Male",
V2007 == 2 ~ "Female",
.default = NA_character_,
comment = "Sex (V2007)"
) |>
step_compute(
age = as.integer(V2009),
comment = "Age in years"
) |>
bake_steps()
workflow(
list(svy_pnadc),
survey::svymean(~age, na.rm = TRUE),
estimation_type = "quarterly"
)
#> stat value se cv confint_lower
#> <char> <num> <num> <num> <num>
#> 1: survey::svymean: age 35.55343 0.856023 0.02407708 33.87566
#> confint_upper
#> <num>
#> 1: 37.23121For real PNADc microdata:
PNADcIBGE::get_pnadc(year = 2023, quarter = 1). The
download is approximately 200 MB.
CPS – United States
The CPS (Current Population Survey) is the US monthly labor force
survey. The ipumsr package (CRAN, GitHub) includes a bundled
CPS extract.
library(ipumsr)
ddi <- read_ipums_ddi(
system.file("extdata", "cps_00160.xml", package = "ipumsr")
)
dt_cps <- data.table(read_ipums_micro(ddi, verbose = FALSE))
svy_cps <- Survey$new(
data = dt_cps,
edition = "2011",
type = "cps",
psu = NULL,
engine = "data.table",
weight = add_weight(annual = "ASECWT")
)
svy_cps <- svy_cps |>
step_recode(health_status,
HEALTH == 1 ~ "Excellent",
HEALTH == 2 ~ "Very good",
HEALTH == 3 ~ "Good",
HEALTH == 4 ~ "Fair",
HEALTH == 5 ~ "Poor",
.default = NA_character_,
comment = "Self-reported health status"
) |>
step_compute(
log_income = log(INCTOT + 1),
comment = "Log total income"
) |>
bake_steps()
workflow(
list(svy_cps),
survey::svymean(~INCTOT, na.rm = TRUE),
estimation_type = "annual"
)
#> stat value se cv confint_lower
#> <char> <num> <num> <num> <num>
#> 1: survey::svymean: INCTOT 191645236 4277528 0.02232004 183261435
#> confint_upper
#> <num>
#> 1: 200029038IPUMS data requires a free account at https://cps.ipums.org. The DDI XML file provides variable metadata for labeling.
ENIGH – Mexico
The ENIGH (Encuesta Nacional de Ingresos y Gastos de los Hogares) is Mexico’s income and expenditure survey. There is no dedicated R package; data is available from INEGI. Below is a synthetic example that mirrors the real structure.
set.seed(42)
dt_enigh <- data.table(
id = 1:200,
upm = rep(1:40, each = 5),
est_dis = rep(1:10, each = 20),
factor = runif(200, 100, 500),
sexo_jefe = sample(1:2, 200, replace = TRUE),
edad_jefe = sample(18:80, 200, replace = TRUE),
ing_cor = rlnorm(200, 10, 1),
tam_hog = sample(1:8, 200, replace = TRUE)
)
svy_enigh <- Survey$new(
data = dt_enigh,
edition = "2022",
type = "enigh",
psu = "upm",
strata = "est_dis",
engine = "data.table",
weight = add_weight(annual = "factor")
)
svy_enigh <- svy_enigh |>
step_recode(sex_head,
sexo_jefe == 1 ~ "Male",
sexo_jefe == 2 ~ "Female",
.default = NA_character_,
comment = "Sex of household head"
) |>
step_compute(
income_pc = ing_cor / tam_hog,
comment = "Per capita household income"
) |>
bake_steps()
workflow(
list(svy_enigh),
survey::svymean(~income_pc, na.rm = TRUE),
estimation_type = "annual"
)
#> stat value se cv confint_lower
#> <char> <num> <num> <num> <num>
#> 1: survey::svymean: income_pc 11928.34 1473.022 0.1234893 9041.267
#> confint_upper
#> <num>
#> 1: 14815.41DHS – International
The DHS (Demographic and Health Surveys) program covers 90+
countries. The rdhs package (CRAN, GitHub) provides API access.
Model datasets (no authentication needed) are available at https://dhsprogram.com/data/model-datasets.cfm.
library(haven)
# Download the model Individual Recode (no credentials needed)
tf <- tempfile(fileext = ".zip")
download.file(
"https://dhsprogram.com/data/model_data/dhs/zzir62dt.zip",
tf,
mode = "wb", quiet = TRUE
)
td <- tempdir()
unzip(tf, exdir = td)
dta_file <- list.files(td,
pattern = "\\.DTA$", full.names = TRUE,
ignore.case = TRUE
)
dt_dhs <- data.table(read_dta(dta_file[1]))
# DHS weights must be divided by 1,000,000
dt_dhs[, wt := as.numeric(v005) / 1e6]
svy_dhs <- Survey$new(
data = dt_dhs,
edition = "2020",
type = "dhs",
psu = "v001",
strata = "v023",
engine = "data.table",
weight = add_weight(annual = "wt")
)
svy_dhs <- svy_dhs |>
step_recode(education,
v106 == 0 ~ "No education",
v106 == 1 ~ "Primary",
v106 == 2 ~ "Secondary",
v106 == 3 ~ "Higher",
.default = NA_character_,
comment = "Education level (v106)"
) |>
step_compute(
children = as.numeric(v201),
comment = "Children ever born"
) |>
bake_steps()
workflow(
list(svy_dhs),
survey::svymean(~children, na.rm = TRUE),
estimation_type = "annual"
)DHS data requires registration at https://dhsprogram.com. The rdhs package
handles API authentication. Weights (v005) must be divided by 1,000,000
before use.
Recipe portability
The same Recipe structure works regardless of the source
survey:
set.seed(42)
dt_demo <- data.table(
id = 1:100,
age = sample(18:65, 100, replace = TRUE),
income = round(runif(100, 1000, 5000), 2),
w = round(runif(100, 0.5, 2), 4)
)
svy_demo <- Survey$new(
data = dt_demo,
edition = "2023",
type = "demo",
psu = NULL,
engine = "data.table",
weight = add_weight(annual = "w")
)
svy_demo <- svy_demo |>
step_compute(indicator = ifelse(age > 30, 1L, 0L)) |>
step_recode(age_group,
age < 30 ~ "Young",
age >= 30 ~ "Adult",
.default = NA_character_
)
my_recipe <- steps_to_recipe(
name = "Demo Indicators",
user = "Research Team",
svy = svy_demo,
description = "Reusable demographic indicators",
steps = get_steps(svy_demo),
topic = "demographics"
)
doc <- my_recipe$doc()
cat("Inputs:", paste(doc$input_variables, collapse = ", "), "\n")
#> Inputs: age
cat("Outputs:", paste(doc$output_variables, collapse = ", "), "\n")
#> Outputs: indicator, age_groupRecipes capture what transformations to apply, not which survey they came from. A recipe built for EPH can be adapted for PNADc by simply renaming variables.
Next steps
- Getting Started – Survey objects and steps
- Survey Designs and Validation – Stratified and clustered designs, replicate weights
-
Rotating Panels –
RotativePanelSurveyandPoolSurvey - Recipes – Creating, saving, and sharing recipes
- ECH Case Study – Full Uruguay ECH pipeline