Cleaning

Load Packages

# Loading tidyverse package
library(tidyverse)
── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.4     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Load Data

# Import data: read the CSV and print it as a preview (the result is not
# assigned to a variable here)
read_csv("data/Integrated_Education_EN.csv")
Rows: 108 Columns: 4
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (3): SchoolLevel, Type, SchoolYear
dbl (1): NoOfStudents

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# A tibble: 108 × 4
   SchoolLevel Type  SchoolYear NoOfStudents
   <chr>       <chr> <chr>             <dbl>
 1 Primary     ID    2019/20             810
 2 Primary     ID    2020/21             930
 3 Primary     ID    2021/22             990
 4 Primary     ID    2022/23            1090
 5 Primary     ID    2023/24            1170
 6 Primary     ID    2024/25            1230
 7 Primary     ASD   2019/20            6400
 8 Primary     ASD   2020/21            6880
 9 Primary     ASD   2021/22            6930
10 Primary     ASD   2022/23            6850
# ℹ 98 more rows

Summary of Dataset

# Read the dataset again and store it as IE
IE <- read_csv("data/Integrated_Education_EN.csv")
Rows: 108 Columns: 4
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (3): SchoolLevel, Type, SchoolYear
dbl (1): NoOfStudents

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Summarise Dataset: show row/column counts plus each column's type and
# first few values
glimpse(IE)
Rows: 108
Columns: 4
$ SchoolLevel  <chr> "Primary", "Primary", "Primary", "Primary", "Primary", "P…
$ Type         <chr> "ID", "ID", "ID", "ID", "ID", "ID", "ASD", "ASD", "ASD", …
$ SchoolYear   <chr> "2019/20", "2020/21", "2021/22", "2022/23", "2023/24", "2…
$ NoOfStudents <dbl> 810, 930, 990, 1090, 1170, 1230, 6400, 6880, 6930, 6850, …

Data Cleaning

# Rename the columns to snake_case
IE_Clean <- rename(
  IE,
  level = SchoolLevel,
  type = Type,
  start_year = SchoolYear,
  no_of_students = NoOfStudents
)
# Keep the first four characters of the school year ("2019/20" -> "2019")
# and store them as the integer starting year
IE_Clean2 <- IE_Clean |>
  mutate(start_year = as.integer(substr(start_year, 1, 4))) |>
  select(level, type, start_year, no_of_students)
head(IE_Clean2)
# A tibble: 6 × 4
  level   type  start_year no_of_students
  <chr>   <chr>      <int>          <dbl>
1 Primary ID          2019            810
2 Primary ID          2020            930
3 Primary ID          2021            990
4 Primary ID          2022           1090
5 Primary ID          2023           1170
6 Primary ID          2024           1230
# Make the cleaned data available under the name df
df <- IE_Clean2
# Save to .RData. This uses its own file path: writing the RData image to the
# ".csv" path would be overwritten by the CSV export below.
save(df, file = "data/integrated_Education_EN copy.RData")
# Save to csv
write_csv(df, "data/integrated_Education_EN copy.csv")