This vignette shows how to use educabR to explore school infrastructure across Brazil using the School Census. We look at internet access, libraries, science labs, sports courts, and accessibility features – and how they vary by region, administrative type, and location.
The School Census contains one row per school (~217,000 schools in 2023) with over 400 variables covering infrastructure, staffing, and programs.
# Download the 2023 School Census for every school in the country
escolas <- get_censo_escolar(year = 2023)

# Or request a single state (uf = "SP") for faster exploration
escolas_sp <- get_censo_escolar(year = 2023, uf = "SP")

Note: the full national file is about 30 MB compressed. When filtering by state, all rows are read before filtering, so the first call may take a moment.
The School Census uses binary columns (1 = yes,
0 = no) for each infrastructure item. Let us compute the
percentage of schools that have each resource nationwide.
# Infrastructure flags to summarise, keyed by School Census column name.
# Each display label sits next to its column, so labels are looked up by
# name instead of relying on the row order produced by pivot_longer().
indicator_labels <- c(
  in_internet = "Internet",
  in_banda_larga = "Broadband",
  in_biblioteca = "Library",
  in_laboratorio_informatica = "Computer lab",
  in_laboratorio_ciencias = "Science lab",
  in_quadra_esportes = "Sports court",
  in_agua_potavel = "Drinking water",
  in_esgoto_rede_publica = "Public sewage"
)
indicators <- names(indicator_labels)

# Percentage of schools reporting 1 for each flag. Missing answers are
# dropped (na.rm = TRUE), so they count in neither numerator nor denominator.
infra_summary <-
  escolas |>
  summarise(
    across(all_of(indicators), \(flag) mean(flag == 1, na.rm = TRUE) * 100)
  ) |>
  pivot_longer(everything(), names_to = "indicator", values_to = "pct") |>
  mutate(label = unname(indicator_labels[indicator]))
# Horizontal bar chart of infra_summary, bars ordered by percentage.
ggplot(infra_summary, aes(x = reorder(label, pct), y = pct)) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  labs(
    title = "Percentage of Schools with Key Infrastructure (2023)",
    x = NULL,
    y = "% of schools"
  ) +
  theme_minimal()

Federal, state, municipal, and private schools have very different
resource levels. The tp_dependencia column encodes the
administrative type.
# Lookup table: tp_dependencia code (as a string) -> English label.
admin_labels <- c(
  "1" = "Federal",
  "2" = "State",
  "3" = "Municipal",
  "4" = "Private"
)

# Percentage of schools with each resource, per administrative type,
# reshaped to one row per (admin, resource) pair.
infra_admin <-
  escolas |>
  mutate(admin = admin_labels[as.character(tp_dependencia)]) |>
  group_by(admin) |>
  summarise(
    across(
      c(
        in_internet,
        in_biblioteca,
        in_laboratorio_informatica,
        in_laboratorio_ciencias
      ),
      \(flag) mean(flag == 1, na.rm = TRUE) * 100
    ),
    .groups = "drop"
  ) |>
  rename(
    Internet = in_internet,
    Library = in_biblioteca,
    `Computer lab` = in_laboratorio_informatica,
    `Science lab` = in_laboratorio_ciencias
  ) |>
  pivot_longer(-admin, names_to = "resource", values_to = "pct")
# Dodged bars: one cluster per resource, one bar per administrative type.
ggplot(infra_admin, aes(x = resource, y = pct, fill = admin)) +
  geom_col(position = "dodge") +
  labs(
    title = "School Infrastructure by Administrative Type (2023)",
    x = NULL,
    y = "% of schools",
    fill = "Type"
  ) +
  theme_minimal() +
  # Tilt the resource names on the x axis
  theme(axis.text.x = element_text(angle = 20, hjust = 1))

Northern and Northeastern states typically have fewer resources than the South and Southeast. Grouping by region reveals the gap.
# Lookup table: Portuguese region name (no_regiao) -> English label.
region_labels <- c(
  "Norte" = "North",
  "Nordeste" = "Northeast",
  "Sudeste" = "Southeast",
  "Sul" = "South",
  "Centro-Oeste" = "Central-West"
)

# Percentage of schools with each resource, per region, reshaped to one
# row per (region, resource) pair.
infra_region <-
  escolas |>
  # as.character() forces a lookup by name: indexing a named vector with a
  # factor would use the factor's integer codes and could pick wrong labels.
  mutate(region = unname(region_labels[as.character(no_regiao)])) |>
  group_by(region) |>
  summarise(
    Internet = mean(in_internet == 1, na.rm = TRUE) * 100,
    Library = mean(in_biblioteca == 1, na.rm = TRUE) * 100,
    `Science lab` = mean(in_laboratorio_ciencias == 1, na.rm = TRUE) * 100,
    `Sports court` = mean(in_quadra_esportes == 1, na.rm = TRUE) * 100,
    .groups = "drop"
  ) |>
  pivot_longer(-region, names_to = "resource", values_to = "pct")
# Dodged bars: one cluster per region, one bar per resource.
ggplot(infra_region, aes(x = region, y = pct, fill = resource)) +
  geom_col(position = "dodge") +
  labs(
    title = "School Infrastructure by Region (2023)",
    x = NULL,
    y = "% of schools",
    fill = NULL
  ) +
  theme_minimal()

The tp_localizacao column distinguishes urban (1) from
rural (2) schools. The infrastructure gap between them is one of the
starkest in Brazilian education.
# Percentage of schools with each resource, split by location, reshaped to
# one row per (location, resource) pair. Schools whose tp_localizacao is 1
# are labelled "Urban"; every other non-missing code is labelled "Rural".
infra_location <-
  escolas |>
  mutate(location = ifelse(tp_localizacao == 1, "Urban", "Rural")) |>
  group_by(location) |>
  summarise(
    across(
      c(
        in_internet,
        in_banda_larga,
        in_biblioteca,
        in_laboratorio_informatica,
        in_laboratorio_ciencias,
        in_quadra_esportes
      ),
      \(flag) mean(flag == 1, na.rm = TRUE) * 100
    ),
    .groups = "drop"
  ) |>
  rename(
    Internet = in_internet,
    Broadband = in_banda_larga,
    Library = in_biblioteca,
    `Computer lab` = in_laboratorio_informatica,
    `Science lab` = in_laboratorio_ciencias,
    `Sports court` = in_quadra_esportes
  ) |>
  pivot_longer(-location, names_to = "resource", values_to = "pct")
# Dodged bars: one cluster per resource, urban and rural side by side.
ggplot(infra_location, aes(x = resource, y = pct, fill = location)) +
  geom_col(position = "dodge") +
  # Fixed colours so each location keeps the same colour
  scale_fill_manual(values = c("Urban" = "steelblue", "Rural" = "coral")) +
  labs(
    title = "School Infrastructure: Urban vs Rural (2023)",
    x = NULL,
    y = "% of schools",
    fill = NULL
  ) +
  theme_minimal() +
  # Tilt the resource names on the x axis
  theme(axis.text.x = element_text(angle = 20, hjust = 1))

The School Census tracks specific accessibility features. The
in_acessibilidade_inexistente flag marks schools with no
accessibility at all.
# Accessibility flags to summarise, keyed by School Census column name.
# Each display label sits next to its column, so labels are looked up by
# name instead of relying on two parallel vectors staying in the same order.
access_labels <- c(
  in_acessibilidade_rampas = "Ramps",
  in_acessibilidade_corrimao = "Handrails",
  in_acessibilidade_elevador = "Elevator",
  in_acessibilidade_pisos_tateis = "Tactile floors",
  in_acessibilidade_sinal_sonoro = "Sound signals",
  in_acessibilidade_sinal_tatil = "Tactile signals",
  in_acessibilidade_sinal_visual = "Visual signals",
  in_acessibilidade_inexistente = "None"
)
access_cols <- names(access_labels)

# Percentage of schools reporting 1 for each flag. Missing answers are
# dropped (na.rm = TRUE).
access_summary <-
  escolas |>
  summarise(
    across(all_of(access_cols), \(flag) mean(flag == 1, na.rm = TRUE) * 100)
  ) |>
  pivot_longer(everything(), names_to = "feature", values_to = "pct") |>
  mutate(label = unname(access_labels[feature]))
# Horizontal bar chart of access_summary, bars ordered by percentage.
ggplot(access_summary, aes(x = reorder(label, pct), y = pct)) +
  geom_col(fill = "#2a9d8f") +
  coord_flip() +
  labs(
    title = "School Accessibility Features (2023)",
    x = NULL,
    y = "% of schools"
  ) +
  theme_minimal()

A per-state view highlights which states are lagging behind in digital connectivity.
# Percentage of schools with internet access in each state (sg_uf),
# sorted from the lowest share to the highest.
internet_uf <-
  escolas |>
  group_by(sg_uf) |>
  summarise(
    across(in_internet, \(flag) mean(flag == 1, na.rm = TRUE) * 100),
    .groups = "drop"
  ) |>
  rename(pct_internet = in_internet) |>
  arrange(pct_internet)
# Horizontal bar chart: one bar per state, ordered by internet coverage.
ggplot(internet_uf) +
  geom_col(
    aes(x = reorder(sg_uf, pct_internet), y = pct_internet),
    fill = "steelblue"
  ) +
  coord_flip() +
  theme_minimal() +
  labs(
    title = "Schools with Internet Access by State (2023)",
    x = NULL,
    y = "% of schools"
  )