eda_r – Blog

Author

invictus

Setup

library(tidyverse)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(janitor)


Attaching package: 'janitor'

The following objects are masked from 'package:stats':

    chisq.test, fisher.test

library(naniar)
library(scales)


Attaching package: 'scales'

The following object is masked from 'package:purrr':

    discard

The following object is masked from 'package:readr':

    col_factor

library(summarytools)

Warning: no DISPLAY variable so Tk is not available

system might not have X11 capabilities; in case of errors when using dfSummary(), set st_options(use.x11 = FALSE)

Attaching package: 'summarytools'

The following object is masked from 'package:tibble':

    view

library(DataExplorer)
library(explore)


Attaching package: 'explore'

The following object is masked from 'package:naniar':

    replace_na_with

library(corrr)

Options

# Strongly bias number printing towards fixed notation instead of scientific.
options(scipen = 999)

Load Data

# Load the generation-23 student list; column types are guessed by readr.
# NOTE(review): NPM is parsed as a double. If it is only used as an
# identifier, reading it as character may be safer -- confirm.
df_student_gen_23 <- read_csv('student_list_generation_23.csv')

Rows: 8082 Columns: 5
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (3): Nama, Kelas Lama, Kelas Baru
dbl (2): No, NPM

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Load the generation-24 student list; column types are guessed by readr.
# NOTE(review): NPM is parsed as a double. If it is only used as an
# identifier, reading it as character may be safer -- confirm.
df_student_gen_24 <- read_csv('student_list_generation_24.csv')

Rows: 8636 Columns: 6
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (4): No Pend., Nama, Kelas, Keterangan
dbl (2): No, NPM

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Inspect Data

df_student_gen_23

# A tibble: 8,082 × 5
      No      NPM Nama                        `Kelas Lama` `Kelas Baru`
   <dbl>    <dbl> <chr>                       <chr>        <chr>       
 1     1 60223026 A'NAS TASYA GUSTI FIANA     1EC01        2EC01       
 2     2 10123014 A. PEBRIYAN MAEYADI PUTRA   1KA01        2KA10       
 3     3 10223027 AARON MATTHEW PUTRA IBRAHIM 1EA01        2EA04       
 4     4 10623004 ABANG MAULANA JAVID FANSURI 1SA05        2SA05       
 5     5 10223028 ABDAN RIJAL SYAKURA         1EA23        2EA28       
 6     6 10223029 ABDAN SYAKUR                1EA22        2EA22       
 7     7 10823003 ABDEL RAUF FEBROZA          1MA01        2MA04       
 8     8 50423017 ABDHAN ZAKI ALFAREZA        1IA01        2IA17       
 9     9 50423018 ABDIL NAYAKA RIZKY          1IA02        2IA01       
10    10 10523002 ABDILLAH ISMAIL ADHA        1PA24        2PA28       
# ℹ 8,072 more rows

# Column-wise overview of the 2023 list: each column's type and first values.
df_student_gen_23 |> 
  glimpse()

Rows: 8,082
Columns: 5
$ No           <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17…
$ NPM          <dbl> 60223026, 10123014, 10223027, 10623004, 10223028, 1022302…
$ Nama         <chr> "A'NAS TASYA GUSTI FIANA", "A. PEBRIYAN MAEYADI PUTRA", "…
$ `Kelas Lama` <chr> "1EC01", "1KA01", "1EA01", "1SA05", "1EA23", "1EA22", "1M…
$ `Kelas Baru` <chr> "2EC01", "2KA10", "2EA04", "2SA05", "2EA28", "2EA22", "2M…

df_student_gen_24

# A tibble: 8,636 × 6
      No `No Pend.` Nama                           NPM Kelas  Keterangan        
   <dbl> <chr>      <chr>                        <dbl> <chr>  <chr>             
 1     1 I241962    A FARHAN ASSIDQI          10824001 1MA01  S1-Ilmu Komunikas…
 2     2 I244822    A'PIFAH DZAKIAH           20624001 1SB01  S1-Pariwisata Pag…
 3     3 I243593    A'ROOF RAIHAN HAKIM       20224001 1EB19  SarMag S1 Akuntan…
 4     4 I247622    AAS TRI HAYATI            11524262 1PA06  S1-Psikologi Pagi…
 5     5 A240094    AATHIFAH ALISHA FAUZIYAH  19124005 ALH241 S1-Sistem Informa…
 6     6 T242801    ABABIL NUR AHMAD          10824002 1MA02  S1-Ilmu Komunikas…
 7     7 T242992    ABBAS ALFIANSYAH ARRASYID 50424001 1IA16  S1-Informatika Pa…
 8     8 P108703    ABBY PINANDITA AL'GHIFARI 30324001 1TC03  S1-Desain Interio…
 9     9 I242606    ABD. RAFA KHARIM          10124001 1KA24  S1-Sistem Informa…
10    10 I248539    ABDALLAH                  31424417 1ID02  S1-Teknik Industr…
# ℹ 8,626 more rows

# Column-wise overview of the 2024 list: each column's type and first values.
df_student_gen_24 |> 
  glimpse()

Rows: 8,636
Columns: 6
$ No         <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, …
$ `No Pend.` <chr> "I241962", "I244822", "I243593", "I247622", "A240094", "T24…
$ Nama       <chr> "A FARHAN ASSIDQI", "A'PIFAH DZAKIAH", "A'ROOF RAIHAN HAKIM…
$ NPM        <dbl> 10824001, 20624001, 20224001, 11524262, 19124005, 10824002,…
$ Kelas      <chr> "1MA01", "1SB01", "1EB19", "1PA06", "ALH241", "1MA02", "1IA…
$ Keterangan <chr> "S1-Ilmu Komunikasi Pagi Depok", "S1-Pariwisata Pagi Depok"…

Understand Data

Check Data Quality

Duplicate

# List rows that share the same Nama + NPM combination; get_dupes() adds a
# dupe_count column with the number of rows in each duplicated group.
df_student_gen_23 |> 
  get_dupes(Nama, NPM)

# A tibble: 4 × 6
  Nama                   NPM dupe_count    No `Kelas Lama` `Kelas Baru`
  <chr>                <dbl>      <int> <dbl> <chr>        <chr>       
1 AHMAD FAUZAN      10223100          2   300 1EA19        2EA19       
2 AHMAD FAUZAN      10223100          2   301 1EA19        2EA19       
3 MUHAMMAD RAMADHAN 10123884          2  1060 1KA03        2KA20       
4 MUHAMMAD RAMADHAN 10123884          2  1061 1KA03        2KA20

# List rows that share the same Nama + NPM combination; get_dupes() adds a
# dupe_count column with the number of rows in each duplicated group.
df_student_gen_24 |> 
  get_dupes(Nama, NPM)

No duplicate combinations found of: Nama, NPM

# A tibble: 0 × 7
# ℹ 7 variables: Nama <chr>, NPM <dbl>, dupe_count <int>, No <dbl>,
#   No Pend. <chr>, Kelas <chr>, Keterangan <chr>

Missing Data

# Plot the missing-value pattern of the 2023 list, column by column.
df_student_gen_23 |> 
  vis_miss()

# Plot the missing-value pattern of the 2024 list, column by column.
df_student_gen_24 |> 
  vis_miss()

Inconsistency

df_student_gen_23

# A tibble: 8,082 × 5
      No      NPM Nama                        `Kelas Lama` `Kelas Baru`
   <dbl>    <dbl> <chr>                       <chr>        <chr>       
 1     1 60223026 A'NAS TASYA GUSTI FIANA     1EC01        2EC01       
 2     2 10123014 A. PEBRIYAN MAEYADI PUTRA   1KA01        2KA10       
 3     3 10223027 AARON MATTHEW PUTRA IBRAHIM 1EA01        2EA04       
 4     4 10623004 ABANG MAULANA JAVID FANSURI 1SA05        2SA05       
 5     5 10223028 ABDAN RIJAL SYAKURA         1EA23        2EA28       
 6     6 10223029 ABDAN SYAKUR                1EA22        2EA22       
 7     7 10823003 ABDEL RAUF FEBROZA          1MA01        2MA04       
 8     8 50423017 ABDHAN ZAKI ALFAREZA        1IA01        2IA17       
 9     9 50423018 ABDIL NAYAKA RIZKY          1IA02        2IA01       
10    10 10523002 ABDILLAH ISMAIL ADHA        1PA24        2PA28       
# ℹ 8,072 more rows

Class

# Tally how many students fall under each character length of `Kelas Baru`.
df_student_gen_23 |>
  count(class_length = str_length(`Kelas Baru`))

# A tibble: 2 × 2
  class_length     n
         <int> <int>
1            3    93
2            5  7989

# Tally how many students fall under each character length of `Kelas`.
df_student_gen_24 |>
  count(class_length = str_length(Kelas))

# A tibble: 2 × 2
  class_length     n
         <int> <int>
1            5  8617
2            6    19

# Enumerate the unique Keterangan values present in the 2024 list.
df_student_gen_24 |> 
  distinct(Keterangan)

# A tibble: 76 × 1
   Keterangan                                   
   <chr>                                        
 1 S1-Ilmu Komunikasi Pagi Depok                
 2 S1-Pariwisata Pagi Depok                     
 3 SarMag S1 Akuntansi - S2 Manajemen Sistem Inf
 4 S1-Psikologi Pagi Depok                      
 5 S1-Sistem Informasi Pagi Depok               
 6 S1-Informatika Pagi Kalimalang               
 7 S1-Desain Interior Pagi Depok                
 8 S1-Sistem Informasi Pagi Kalimalang          
 9 S1-Teknik Industri Pagi Depok                
10 S1-Informatika Pagi Depok                    
# ℹ 66 more rows

# Split every Keterangan value on hyphens or spaces and tally how many pieces
# each one yields.
df_student_gen_24 |>
  count(Keterangan_length = lengths(str_split(Keterangan, '-| ')))

# A tibble: 7 × 2
  Keterangan_length     n
              <int> <int>
1                 4  4323
2                 5  3798
3                 6   209
4                 7   163
5                 8     9
6                 9    66
7                10    68

Inaccuracy

Data Quality Summary

4 duplicate rows in the 2023 data (2 students each listed twice); none in the 2024 data
No Missing Values
Most class codes follow a 5-character pattern; a few special classes use 3- or 6-character codes
Most class descriptions follow a consistent pattern, apart from several special classes

Further Action: dedicated page for cleaning the data

Clean Data

The data is cleaned on a separate, dedicated page; the cleaned result is loaded below.

# Load the combined, cleaned dataset from CSV; column types are guessed by
# readr.
df_combined_cleaned <- read_csv('combined_cleaned_dataset.csv')

Rows: 16716 Columns: 10
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (7): major_code, class_number, class_code, major, degree, shift, branch
dbl (3): student_id, year_code, generation_year

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Analyze Data

# Sanity check: show any rows whose major is missing.
df_combined_cleaned |> 
  filter(is.na(major))

# A tibble: 0 × 10
# ℹ 10 variables: student_id <dbl>, year_code <dbl>, major_code <chr>,
#   class_number <chr>, class_code <chr>, major <chr>, generation_year <dbl>,
#   degree <chr>, shift <chr>, branch <chr>

Major Enrollment by Generation Year

# Enrollment per major, with one column per generation year plus the
# year-over-year change.
# values_fill = 0L records a major that has no students in a given year as an
# explicit zero, so `change` is computed for every major instead of being NA
# (NA rows are otherwise silently dropped by the plots further down).
df_major_enrollment <- df_combined_cleaned |>
  count(major, generation_year) |>
  pivot_wider(
    names_from = generation_year,
    values_from = n,
    values_fill = 0L
  ) |>
  mutate(change = `2024` - `2023`)

# Rank majors by year-over-year change, largest increase first.
df_major_enrollment |> 
  arrange(desc(change))

# A tibble: 28 × 4
   major            `2023` `2024` change
   <chr>             <int>  <int>  <int>
 1 Manajemen          1335   1518    183
 2 Ilmu Komunikasi    1072   1160     88
 3 Akuntansi           607    685     78
 4 Farmasi              64    133     69
 5 Sistem Informasi   1214   1268     54
 6 Teknik Sipil        148    197     49
 7 Teknik Industri     340    387     47
 8 Desain Interior      14     60     46
 9 Pariwisata           31     64     33
10 Teknik Mesin        126    158     32
# ℹ 18 more rows

# Order the major levels by their 2024 enrollment so plots list them by size.
# fct_reorder() accepts a character vector and converts it to a factor itself,
# so no separate as.factor() step is needed. .na_rm = TRUE makes the handling
# of majors without a 2024 count explicit instead of relying on the default,
# which drops them with a warning.
df_major_enrollment <- df_major_enrollment |>
  mutate(major = fct_reorder(major, `2024`, .na_rm = TRUE))

Warning: There was 1 warning in `mutate()`.
ℹ In argument: `major = fct_reorder(major, `2024`)`.
Caused by warning:
! `fct_reorder()` removing 2 missing values.
ℹ Use `.na_rm = TRUE` to silence this message.
ℹ Use `.na_rm = FALSE` to preserve NAs.

# Compare 2023 and 2024 enrollment per major.
# The data is reshaped to one row per major/year and the bars are dodged so
# both years are always visible; stacking two geom_col() layers on top of each
# other hides the 2024 bar for every major whose 2023 count is larger.
df_major_enrollment |>
  pivot_longer(
    cols = c(`2023`, `2024`),
    names_to = "year",
    values_to = "students"
  ) |>
  ggplot(aes(x = students, y = major, fill = year)) +
  geom_col(position = "dodge") +
  scale_fill_manual(name = "Year",
                    values = c("2024" = "lightblue", "2023" = "#4d4d4d")) +
  labs(title = "Comparing Gunadarma Enrollment Rate from 2023 to 2024",
       x = "Total Students",
       y = "Major")

Warning: Removed 2 rows containing missing values or values outside the scale range
(`geom_col()`).

# Horizontal bars of the year-over-year change, with majors sorted by that
# change.
df_major_enrollment |>
  ggplot(aes(x = change, y = fct_reorder(major, change))) +
  geom_col() +
  labs(
    title = 'Total Student Difference from 2023 to 2024',
    x = 'Student Difference',
    y = 'Major'
  )

Warning: `fct_reorder()` removing 2 missing values.
ℹ Use `.na_rm = TRUE` to silence this message.
ℹ Use `.na_rm = FALSE` to preserve NAs.

Warning: Removed 2 rows containing missing values or values outside the scale range
(`geom_col()`).

# Save the most recently displayed plot. Width and height are pinned (in
# inches) so the image size does not depend on the active graphics device.
ggsave('annual_enrollment_2.png', width = 7, height = 5)

Saving 7 x 5 in image

Warning: `fct_reorder()` removing 2 missing values.
ℹ Use `.na_rm = TRUE` to silence this message.
ℹ Use `.na_rm = FALSE` to preserve NAs.

Warning: Removed 2 rows containing missing values or values outside the scale range
(`geom_col()`).

Setup

Options

Load Data

Inspect Data

Understand Data

Check Data Quality

Duplicate

Missing Data

Inconsistency

Class

Inaccuracy

Data Quality Summary

Clean Data

Analyze Data

Major Enrollment by Generation Year

Correlation