EDA on Amazon Prime Movies and TV Shows

Case Study

Tableau

Exploratory Data Analysis on Amazon Prime Titles

Author

invictus

Published

August 26, 2024

Setup

Library

library(tidyverse)

── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
✔ dplyr     1.1.4     ✔ readr     2.1.5
✔ forcats   1.0.0     ✔ stringr   1.5.1
✔ ggplot2   3.5.1     ✔ tibble    3.2.1
✔ lubridate 1.9.3     ✔ tidyr     1.3.1
✔ purrr     1.0.2     
── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
✖ dplyr::filter() masks stats::filter()
✖ dplyr::lag()    masks stats::lag()
ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(naniar)
library(visdat)
library(DataExplorer)
library(janitor)


Attaching package: 'janitor'

The following objects are masked from 'package:stats':

    chisq.test, fisher.test

library(validate)


Attaching package: 'validate'

The following object is masked from 'package:naniar':

    all_complete

The following object is masked from 'package:dplyr':

    expr

The following object is masked from 'package:ggplot2':

    expr

Dataframe

df <- read_csv('./amazon_prime_titles.csv')

Rows: 9668 Columns: 12
── Column specification ────────────────────────────────────────────────────────
Delimiter: ","
chr (11): show_id, type, title, director, cast, country, date_added, rating,...
dbl  (1): release_year

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Inspect Dataset

Understand the Dataset

df |> 
  head()

# A tibble: 6 × 12
  show_id type  title      director cast  country date_added release_year rating
  <chr>   <chr> <chr>      <chr>    <chr> <chr>   <chr>             <dbl> <chr> 
1 s1      Movie The Grand… Don McK… Bren… Canada  March 30,…         2014 <NA>  
2 s2      Movie Take Care… Girish … Mahe… India   March 30,…         2018 13+   
3 s3      Movie Secrets o… Josh We… Tom … United… March 30,…         2017 <NA>  
4 s4      Movie Pink: Sta… Sonia A… Inte… United… March 30,…         2014 <NA>  
5 s5      Movie Monster M… Giles F… Harr… United… March 30,…         1989 <NA>  
6 s6      Movie Living Wi… Paul We… Greg… United… March 30,…         1989 <NA>  
# ℹ 3 more variables: duration <chr>, listed_in <chr>, description <chr>

df |> tail()

# A tibble: 6 × 12
  show_id type    title    director cast  country date_added release_year rating
  <chr>   <chr>   <chr>    <chr>    <chr> <chr>   <chr>             <dbl> <chr> 
1 s9663   Movie   River    Emily S… Mary… <NA>    <NA>               2021 16+   
2 s9664   Movie   Pride O… Joseph … Leo … <NA>    <NA>               1940 7+    
3 s9665   TV Show Planet … <NA>     DICK… <NA>    <NA>               2018 13+   
4 s9666   Movie   Outpost  Steve B… Ray … <NA>    <NA>               2008 R     
5 s9667   TV Show Maradon… <NA>     Este… <NA>    <NA>               2021 TV-MA 
6 s9668   Movie   Harry B… Daniel … Mich… <NA>    <NA>               2010 R     
# ℹ 3 more variables: duration <chr>, listed_in <chr>, description <chr>

df |> 
  glimpse()

Rows: 9,668
Columns: 12
$ show_id      <chr> "s1", "s2", "s3", "s4", "s5", "s6", "s7", "s8", "s9", "s1…
$ type         <chr> "Movie", "Movie", "Movie", "Movie", "Movie", "Movie", "Mo…
$ title        <chr> "The Grand Seduction", "Take Care Good Night", "Secrets o…
$ director     <chr> "Don McKellar", "Girish Joshi", "Josh Webber", "Sonia And…
$ cast         <chr> "Brendan Gleeson, Taylor Kitsch, Gordon Pinsent", "Mahesh…
$ country      <chr> "Canada", "India", "United States", "United States", "Uni…
$ date_added   <chr> "March 30, 2021", "March 30, 2021", "March 30, 2021", "Ma…
$ release_year <dbl> 2014, 2018, 2017, 2014, 1989, 1989, 2017, 2016, 2017, 199…
$ rating       <chr> NA, "13+", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ duration     <chr> "113 min", "110 min", "74 min", "69 min", "45 min", "52 m…
$ listed_in    <chr> "Comedy, Drama", "Drama, International", "Action, Drama, …
$ description  <chr> "A small fishing village must procure a local doctor to s…

What’s a show_id?

df |> 
  mutate(show_id = parse_number(show_id)) |>
  pull(show_id) |> 
  is_linear_sequence()

[1] TRUE

Check Data Quality

Missing Values

df |> 
  vis_miss()

df |> 
  gg_miss_upset()

Duplicate Rows

df |> 
  get_dupes()

No variable names specified - using all columns.

No duplicate combinations found of: show_id, type, title, director, cast, country, date_added, release_year, rating, ... and 3 other variables

# A tibble: 0 × 13
# ℹ 13 variables: show_id <chr>, type <chr>, title <chr>, director <chr>,
#   cast <chr>, country <chr>, date_added <chr>, release_year <dbl>,
#   rating <chr>, duration <chr>, listed_in <chr>, description <chr>,
#   dupe_count <int>

df |> 
  get_dupes(show_id)

No duplicate combinations found of: show_id

# A tibble: 0 × 13
# ℹ 13 variables: show_id <chr>, dupe_count <int>, type <chr>, title <chr>,
#   director <chr>, cast <chr>, country <chr>, date_added <chr>,
#   release_year <dbl>, rating <chr>, duration <chr>, listed_in <chr>,
#   description <chr>

df |> 
  get_dupes(description)

# A tibble: 394 × 13
   description dupe_count show_id type   title director cast  country date_added
   <chr>            <int> <chr>   <chr>  <chr> <chr>    <chr> <chr>   <chr>     
 1 1                   15 s7128   Movie  Act … 1        1     <NA>    <NA>      
 2 1                   15 s7129   Movie  Act … 1        1     <NA>    <NA>      
 3 1                   15 s7218   TV Sh… Seri… <NA>     1     <NA>    <NA>      
 4 1                   15 s8709   TV Sh… Clip… <NA>     1     <NA>    <NA>      
 5 1                   15 s8728   TV Sh… Act … <NA>     1     <NA>    September…
 6 1                   15 s8729   TV Sh… ACT … <NA>     1     <NA>    <NA>      
 7 1                   15 s8753   TV Sh… ACT … <NA>     1     <NA>    <NA>      
 8 1                   15 s8754   TV Sh… ACT … <NA>     1     <NA>    <NA>      
 9 1                   15 s8756   Movie  ACT … 1        1     <NA>    <NA>      
10 1                   15 s8758   Movie  ACT … 1        1     <NA>    <NA>      
# ℹ 384 more rows
# ℹ 4 more variables: release_year <dbl>, rating <chr>, duration <chr>,
#   listed_in <chr>

df |> 
  get_dupes(cast) |> 
  filter(!is.na(cast)) |> 
  distinct(cast)

# A tibble: 210 × 1
   cast                            
   <chr>                           
 1 Maggie Binkley                  
 2 1                               
 3 Anne-Marie Newland              
 4 Cassandra Peterson              
 5 Grace Tamayo, Erin Webbs        
 6 Gene Autry, Champion, Gail Davis
 7 Stevin John                     
 8 Eddie Izzard                    
 9 Gallagher                       
10 LB, Aaron Michael               
# ℹ 200 more rows

Data Constraints

Check Numerical Data

df |> 
  ggplot(aes(release_year)) +
  geom_histogram() +
  labs(title = 'Release Year')

`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

df |> 
  mutate(duration = parse_number(duration)) |> 
  ggplot(aes(duration)) +
  geom_histogram() +
  labs(title = 'Duration (in Minute)')

`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

df |> 
  mutate(duration = parse_number(duration)) |> 
  ggplot(aes(duration)) +
  geom_histogram(binwidth = 1) +
  coord_cartesian(xlim = c(200,700), ylim = c(0,9))

df |> 
  mutate(duration = parse_number(duration)) |> 
  arrange(desc(duration)) |> 
  relocate(title, duration, type, description)

# A tibble: 9,668 × 12
   title    duration type  description show_id director cast  country date_added
   <chr>       <dbl> <chr> <chr>       <chr>   <chr>    <chr> <chr>   <chr>     
 1 Soothin…      601 Movie Black scre… s934    Mark Kn… <NA>  <NA>    <NA>      
 2 Himalay…      550 Movie This ambie… s3851   Mark Kn… Niv … <NA>    <NA>      
 3 Midnigh…      541 Movie Our most p… s1674   Mark Kn… <NA>  <NA>    <NA>      
 4 Gentle …      541 Movie One of our… s2368   Mark Kn… <NA>  <NA>    <NA>      
 5 Gentle …      541 Movie Beautiful … s2369   Mark Kn… <NA>  <NA>    <NA>      
 6 9 Hour …      541 Movie Black scre… s3446   Mark Kn… <NA>  <NA>    <NA>      
 7 Thunder…      541 Movie Black scre… s4088   Mark Kn… <NA>  <NA>    <NA>      
 8 Pacific…      541 Movie For a full… s4129   Mark Kn… <NA>  <NA>    <NA>      
 9 New Yor…      541 Movie Are you a … s4278   Mark Kn… <NA>  <NA>    <NA>      
10 Gentle …      541 Movie Drift off … s4584   Mark Kn… <NA>  <NA>    <NA>      
# ℹ 9,658 more rows
# ℹ 3 more variables: release_year <dbl>, rating <chr>, listed_in <chr>

Check Categorical Data

df |> 
  count(type)

# A tibble: 2 × 2
  type        n
  <chr>   <int>
1 Movie    7814
2 TV Show  1854

df |> 
  count(listed_in)

# A tibble: 518 × 2
   listed_in                                     n
   <chr>                                     <int>
 1 Action                                      238
 2 Action, Adventure                            50
 3 Action, Adventure, Animation                  2
 4 Action, Adventure, Anime                      4
 5 Action, Adventure, Comedy                    26
 6 Action, Adventure, Documentary                3
 7 Action, Adventure, Drama                     37
 8 Action, Adventure, Faith and Spirituality     1
 9 Action, Adventure, Fantasy                    7
10 Action, Adventure, Horror                    10
# ℹ 508 more rows

Possible Null Value

df |> 
  vis_expect(function(x) nchar(x) < 10)

nchar_limit <- 10
df_possible_null <- df |>
  select(title, cast, description, director)

v_nchar <- validator(
 title = nchar(title) > nchar_limit,
 cast =  nchar(cast) > nchar_limit,
 director = nchar(director) > nchar_limit,
 description = nchar(description) > nchar_limit
)

cf_nchar <- confront(df_possible_null, v_nchar)
summary(cf_nchar)

         name items passes fails  nNA error warning
1       title  9668   7447  2221    0 FALSE   FALSE
2        cast  9668   8206   229 1233 FALSE   FALSE
3    director  9668   6445  1141 2082 FALSE   FALSE
4 description  9668   9638    30    0 FALSE   FALSE
                        expression
1       nchar(title) > nchar_limit
2        nchar(cast) > nchar_limit
3    nchar(director) > nchar_limit
4 nchar(description) > nchar_limit

Title

df |> 
  select(release_year, type:director, description) |> 
  filter(nchar(title) < 3)

# A tibble: 9 × 5
  release_year type    title director          description                      
         <dbl> <chr>   <chr> <chr>             <chr>                            
1         2014 Movie   41    Glenn Triggs      "A young man discovers a hole in…
2         2021 Movie   X     Scott J. Ramsey   "The chair of a mysterious found…
3         2007 Movie   YO    Rafa Cortes       "A handyman named Hans moves to …
4         1994 Movie   IQ    Fred Schepisi     "All the world knows about Einst…
5         2019 Movie   M     Yolande Zauberman "\"M\" as Menahem, child prodigy…
6         2021 Movie   We    Mani Nasry        "In the pursuit of love and happ…
7         2008 Movie   21    Robert Luketic    "Academy Award Winner Kevin Spac…
8         2021 TV Show GD    <NA>              "In the 1930s,America was just f…
9         1991 Movie   Us    Michael Landon    "In his final role, Michael Land…

Cast

df |> 
 names()

 [1] "show_id"      "type"         "title"        "director"     "cast"        
 [6] "country"      "date_added"   "release_year" "rating"       "duration"    
[11] "listed_in"    "description"

df |> 
  select(cast, description) |> 
  filter(nchar(cast) < 4)

# A tibble: 38 × 2
   cast  description                                                            
   <chr> <chr>                                                                  
 1 TJ    In his debut special taped in Brooklyn, Haitian born stand-up comedian…
 2 Nas   When Nas was just 19 years old, he composed what would become one of h…
 3 1     2                                                                      
 4 1     Series before 1C onboarding                                            
 5 1     Title Post onboarding 8                                                
 6 1     1                                                                      
 7 1     1                                                                      
 8 1     clip1                                                                  
 9 1     1                                                                      
10 1     ACT 4 - Title after removing 1C - 5                                    
# ℹ 28 more rows

Director

df |> 
  relocate(director, title) |> 
  filter(nchar(director) < 3)

# A tibble: 17 × 12
   director title     show_id type  cast  country date_added release_year rating
   <chr>    <chr>     <chr>   <chr> <chr> <chr>   <chr>             <dbl> <chr> 
 1 TC       The Miss… s6724   Movie TC, … <NA>    <NA>               2021 18+   
 2 1        Title Po… s6931   Movie 1     <NA>    <NA>               2021 18+   
 3 1        Act 4 - … s7128   Movie 1     <NA>    <NA>               2021 ALL   
 4 1        Act 4 - … s7129   Movie 1     <NA>    <NA>               2021 ALL   
 5 1        Clip: 1   s7140   Movie 1     <NA>    <NA>               2021 18+   
 6 1        Clip: AC… s7239   Movie 1     <NA>    <NA>               2021 ALL   
 7 1        Clip: AC… s7240   Movie 1     <NA>    <NA>               2021 ALL   
 8 1        ACT 4 - … s7246   Movie 1     <NA>    <NA>               2021 ALL   
 9 1        Clip: AC… s7259   Movie 1     <NA>    <NA>               2021 7+    
10 1        Act 6 - … s8502   Movie 1     <NA>    <NA>               2021 ALL   
11 1        Act 5 - … s8503   Movie 1     <NA>    <NA>               2021 ALL   
12 1        ACT 2 - … s8756   Movie 1     <NA>    <NA>               2021 ALL   
13 1        ACT 2 - … s8757   Movie 1     <NA>    <NA>               2021 ALL   
14 1        ACT 2 - … s8758   Movie 1     <NA>    <NA>               2021 ALL   
15 1        ACT 2 - … s8759   Movie 1     <NA>    <NA>               2021 ALL   
16 1        Clip: Ac… s9463   Movie 1     <NA>    <NA>               2021 ALL   
17 1        Clip: Ac… s9464   Movie 1     <NA>    <NA>               2021 ALL   
# ℹ 3 more variables: duration <chr>, listed_in <chr>, description <chr>

Description

df |> 
  relocate(description, title) |> 
  filter(nchar(description) < 15)

# A tibble: 32 × 12
   description title             show_id type  director cast  country date_added
   <chr>       <chr>             <chr>   <chr> <chr>    <chr> <chr>   <chr>     
 1 a           Simple Gifts: Th… s1001   Movie Habib A… <NA>  <NA>    <NA>      
 2 Test title  Movie10y          s1589   Movie Test ti… Prod… <NA>    <NA>      
 3 Test title  Movie10x          s1590   Movie Test ti… <NA>  <NA>    <NA>      
 4 test        IN and Classic -… s2123   Movie <NA>     <NA>  <NA>    <NA>      
 5 Elfen Lied  Elfen Lied        s2564   TV S… <NA>     Kira… <NA>    <NA>      
 6 test        Dmec and other    s5378   Movie <NA>     <NA>  <NA>    <NA>      
 7 2           Series before 1C… s6419   TV S… <NA>     1     <NA>    July 12, …
 8 1           Act 4 - Title be… s7128   Movie 1        1     <NA>    <NA>      
 9 1           Act 4 - Title be… s7129   Movie 1        1     <NA>    <NA>      
10 clip1       Clip: 1           s7140   Movie 1        1     <NA>    <NA>      
# ℹ 22 more rows
# ℹ 4 more variables: release_year <dbl>, rating <chr>, duration <chr>,
#   listed_in <chr>

Cross Check

Categorical Data

Source: https://www.amazon.com/Series-before-1C-onboarding-2/dp/B09942SNJQ

Numerical Data

Source: https://www.amazon.com/norte-sleep-hours-black-screen/dp/B0799FS2RY

Summary of Preliminary Data Inspection

No duplicate rows
19% of missing values. 98% of them came from date_added. Which normally isn’t published anyway.
Found nearly 100 suspicious values such as ‘1’, ‘2’, ‘test’, and ‘test title’ across cast, director, and description. They were cross-checked and were found on Amazon Prime’s website, which indicate a ‘fake’ title posted for the purpose of testing.
Outliers such as unusually long movie length are confirmed to exist on Amazon Prime.
duration data type depends on the title type. TV Show uses season and Movie uses minute

Clean Data

Remove columns with too many missing values

df |> 
  names()

 [1] "show_id"      "type"         "title"        "director"     "cast"        
 [6] "country"      "date_added"   "release_year" "rating"       "duration"    
[11] "listed_in"    "description"

df_removed_columns <- df |> 
  select(-country, -date_added)

df_removed_columns |> 
  names()

 [1] "show_id"      "type"         "title"        "director"     "cast"        
 [6] "release_year" "rating"       "duration"     "listed_in"    "description"

Rename columns

df_renamed_column <- df_removed_columns |> 
  rename(genre = listed_in,
         id = show_id)

Correct data types

df |> 
  glimpse()

Rows: 9,668
Columns: 12
$ show_id      <chr> "s1", "s2", "s3", "s4", "s5", "s6", "s7", "s8", "s9", "s1…
$ type         <chr> "Movie", "Movie", "Movie", "Movie", "Movie", "Movie", "Mo…
$ title        <chr> "The Grand Seduction", "Take Care Good Night", "Secrets o…
$ director     <chr> "Don McKellar", "Girish Joshi", "Josh Webber", "Sonia And…
$ cast         <chr> "Brendan Gleeson, Taylor Kitsch, Gordon Pinsent", "Mahesh…
$ country      <chr> "Canada", "India", "United States", "United States", "Uni…
$ date_added   <chr> "March 30, 2021", "March 30, 2021", "March 30, 2021", "Ma…
$ release_year <dbl> 2014, 2018, 2017, 2014, 1989, 1989, 2017, 2016, 2017, 199…
$ rating       <chr> NA, "13+", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA…
$ duration     <chr> "113 min", "110 min", "74 min", "69 min", "45 min", "52 m…
$ listed_in    <chr> "Comedy, Drama", "Drama, International", "Action, Drama, …
$ description  <chr> "A small fishing village must procure a local doctor to s…

df_correct_types <- df_renamed_column |> mutate(
  id = as.integer(parse_number(id)),
  type = as_factor(type),
  release_year = as.integer(release_year),
  duration = parse_number(duration)
)

Transform Data

Pivot dataset to long format by Genre

df_pivot_long_by_genre <- df_correct_types |> 
  mutate(genre = str_split(genre, ', ')) |> 
  unnest_longer(genre) |> 
  write_csv('amazon_prime_titles_pivot_long_by_genre.csv')

df_pivot_long_by_genre |>
  pull(genre) |> 
  head(10)

 [1] "Comedy"        "Drama"         "Drama"         "International"
 [5] "Action"        "Drama"         "Suspense"      "Documentary"  
 [9] "Drama"         "Fantasy"

df_pivot_long_by_genre |> 
  write_csv('amazon_prime_titles_pivot_long_by_genre.csv')

Pivot dataset to long format by Cast

df_pivot_long_by_cast <-  df_correct_types |> 
  mutate(cast = str_split(cast, ', ')) |> 
  unnest_longer(cast) |> 
  write_csv('amazon_prime_titles_pivot_long_by_cast.csv')

df_pivot_long_by_cast |> 
  pull(cast) |> 
  head(10)

 [1] "Brendan Gleeson"  "Taylor Kitsch"    "Gordon Pinsent"   "Mahesh Manjrekar"
 [5] "Abhay Mahajan"    "Sachin Khedekar"  "Tom Sizemore"     "Lorenzo Lamas"   
 [9] "Robert LaSardo"   "Richard Jones"

df_pivot_long_by_cast |> 
  write_csv('amazon_prime_titles_pivot_long_by_cast.csv')

Pivot dataset to wide by Genre

df_pivot_wide_by_genre <- df_correct_types |> 
  select(id, title, director, release_year, genre) |> 
  mutate(genre = strsplit(genre, ', ')) |> 
  unnest_longer(genre) |> 
  pivot_wider(names_from = genre,
              values_from = genre,
              values_fn = ~ !is.na(.),
              values_fill = FALSE,
              names_prefix = 'is_')

df_pivot_wide_by_genre |> 
  names()

 [1] "id"                           "title"                       
 [3] "director"                     "release_year"                
 [5] "is_Comedy"                    "is_Drama"                    
 [7] "is_International"             "is_Action"                   
 [9] "is_Suspense"                  "is_Documentary"              
[11] "is_Fantasy"                   "is_Kids"                     
[13] "is_Special Interest"          "is_Science Fiction"          
[15] "is_Adventure"                 "is_Horror"                   
[17] "is_Sports"                    "is_Talk Show and Variety"    
[19] "is_Anime"                     "is_Arts"                     
[21] "is_Entertainment"             "is_and Culture"              
[23] "is_TV Shows"                  "is_Animation"                
[25] "is_Music Videos and Concerts" "is_Fitness"                  
[27] "is_Faith and Spirituality"    "is_Military and War"         
[29] "is_Western"                   "is_LGBTQ"                    
[31] "is_Romance"                   "is_Unscripted"               
[33] "is_Young Adult Audience"      "is_Arthouse"                 
[35] "is_Historical"

df_pivot_wide_by_genre |> 
  write_csv('amazon_prime_titles_pivot_wide_by_genre.csv')

Analyze Data

Tableau Links

General Analysis (using cleaned csv):
Genre Analysis (using pivot by genre csv):