Analyzing Top 10.000 Most Popular Movies on IMDB
Finding the common characteristics among Top 10.000 movies of all time
EDA with SQL
Exploring Data
-- url title example
-- https://www.imdb.com/title/tt0177484/
-- preview titles
select * from title_basics tb
-- count of titles: 9,123,039
select count(*) from title_basics tb
-- count of each titleType
--| titleType | count | percentage |
--|---------------|-----------|------------|
--| tvEpisode | 6,967,265 | 76.37% |
--| short | 832,301 | 9.12% |
--| movie | 590,707 | 6.47% |
--| video | 251,682 | 2.76% |
--| tvSeries | 225,012 | 2.47% |
--| tvMovie | 127,122 | 1.39% |
--| tvMiniSeries | 46,600 | 0.51% |
--| tvSpecial | 41,436 | 0.45% |
--| videoGame | 32,570 | 0.36% |
--| tvShort | 8,343 | 0.09% |
--| tvPilot | 1 | 0.00% |
SELECT titleType,
COUNT(*) as count,
ROUND((COUNT(*) * 100.0 / 9123039), 2)
as percentage
FROM title_basics tb
GROUP BY titleType
ORDER BY count DESC;
-- count title in title_ratings: 1,470,769
select count(*) from title_ratings tr;
-- count of title that has ratings: 1,236,363
select count(*)
from title_basics tb
join title_ratings tr
on tb.tconst = tr.tconst;
-- find missing titles
-- found tt0187178 that redirects to tt0177484 which exists in tb
-- found tt0253688, tt0253697 that actually exist with matching rating
select * from title_ratings tr
left join title_basics tb
on tr.tconst = tb.tconst
where tb.tconst is null
-- movie counts: 590,707
select count(*) from title_basics tb
where titleType = 'movie';
--newest movies
select * from title_basics tb
where titleType = 'movie' and startYear != '\N'
order by startYear desc
--oldest movies
select * from title_basics tb
where titleType = 'movie' and startYear != '\N'
order by startYear
-- movie that has ratings: 279,122
select count(*)
from title_basics tb
join title_ratings tr
on tb.tconst = tr.tconst
where tb.titleType = 'movie';
-- check sample size for analysis target:
--- Top 10.000 Movies based on Num Votes and Ratings
----- Filter release year too?
----- based on numVotes
select *
from title_basics tb
inner join title_ratings tr
on tb.tconst = tr.tconst
where titleType = 'movie'
order by tr.numVotes desc
limit 10000
----- sort by numVotes & average must be 6.8 or higher
select *
from title_basics tb
inner join title_ratings tr
on tb.tconst = tr.tconst
where titleType = 'movie' and averageRating >= 7.0
order by tr.numVotes desc
limit 10000
Extracting Data
-- join title basics with title ratings for finding most popoular movies ordered by number of votes
select tb.tconst, primaryTitle, genres, startYear, runtimeMinutes, averageRating, numVotes
from title_basics tb
inner join title_ratings tr
on tb.tconst = tr.tconst
where titleType = 'movie'
order by numVotes desc
limit 10000