#подгрузим основные библиотеки
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
from time import strptime
import math as mth
from plotly import graph_objects as go
#загрузим файл
data = pd.read_excel('netflix_titles_2021.xlsx')
#выведем первые 5 значений
display(data.head())
show_id | type | title | director | cast | country | date_added | release_year | rating | duration | listed_in | description | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | s1 | Movie | Dick Johnson Is Dead | Kirsten Johnson | NaN | United States | 09.25.2021 | 2020.0 | PG-13 | 90 min | Documentaries | As her father nears the end of his life, filmm... |
1 | s2 | TV Show | Blood & Water | NaN | Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban... | South Africa | 09.24.2021 | 2021.0 | TV-MA | 2 Seasons | International TV Shows, TV Dramas, TV Mysteries | After crossing paths at a party, a Cape Town t... |
2 | s3 | TV Show | Ganglands | Julien Leclercq | Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi... | NaN | 09.24.2021 | 2021.0 | TV-MA | 1 Season | Crime TV Shows, International TV Shows, TV Act... | To protect his family from a powerful drug lor... |
3 | s4 | TV Show | Jailbirds New Orleans | NaN | NaN | NaN | 09.24.2021 | 2021.0 | TV-MA | 1 Season | Docuseries, Reality TV | Feuds, flirtations and toilet talk go down amo... |
4 | s5 | TV Show | Kota Factory | NaN | Mayur More, Jitendra Kumar, Ranjan Raj, Alam K... | India | 09.24.2021 | 2021.0 | TV-MA | 2 Seasons | International TV Shows, Romantic TV Shows, TV ... | In a city of coaching centers known to train I... |
#рассмотрим подробнее
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 8807 entries, 0 to 8806 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 show_id 8807 non-null object 1 type 8807 non-null object 2 title 8807 non-null object 3 director 6173 non-null object 4 cast 7982 non-null object 5 country 7976 non-null object 6 date_added 8797 non-null object 7 release_year 8807 non-null float64 8 rating 8803 non-null object 9 duration 8804 non-null object 10 listed_in 8807 non-null object 11 description 8807 non-null object dtypes: float64(1), object(11) memory usage: 825.8+ KB
#изменим тим данных у дат
data['date_added'] = pd.to_datetime(data['date_added'], format='%m.%d.%Y')
data['release_year'] = data['release_year'].astype('int')
#проверим
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 8807 entries, 0 to 8806 Data columns (total 12 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 show_id 8807 non-null object 1 type 8807 non-null object 2 title 8807 non-null object 3 director 6173 non-null object 4 cast 7982 non-null object 5 country 7976 non-null object 6 date_added 8797 non-null datetime64[ns] 7 release_year 8807 non-null int32 8 rating 8803 non-null object 9 duration 8804 non-null object 10 listed_in 8807 non-null object 11 description 8807 non-null object dtypes: datetime64[ns](1), int32(1), object(10) memory usage: 791.4+ KB
display(data.head())
show_id | type | title | director | cast | country | date_added | release_year | rating | duration | listed_in | description | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | s1 | Movie | Dick Johnson Is Dead | Kirsten Johnson | NaN | United States | 2021-09-25 | 2020 | PG-13 | 90 min | Documentaries | As her father nears the end of his life, filmm... |
1 | s2 | TV Show | Blood & Water | NaN | Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban... | South Africa | 2021-09-24 | 2021 | TV-MA | 2 Seasons | International TV Shows, TV Dramas, TV Mysteries | After crossing paths at a party, a Cape Town t... |
2 | s3 | TV Show | Ganglands | Julien Leclercq | Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi... | NaN | 2021-09-24 | 2021 | TV-MA | 1 Season | Crime TV Shows, International TV Shows, TV Act... | To protect his family from a powerful drug lor... |
3 | s4 | TV Show | Jailbirds New Orleans | NaN | NaN | NaN | 2021-09-24 | 2021 | TV-MA | 1 Season | Docuseries, Reality TV | Feuds, flirtations and toilet talk go down amo... |
4 | s5 | TV Show | Kota Factory | NaN | Mayur More, Jitendra Kumar, Ranjan Raj, Alam K... | India | 2021-09-24 | 2021 | TV-MA | 2 Seasons | International TV Shows, Romantic TV Shows, TV ... | In a city of coaching centers known to train I... |
#рассмотрим колонку рейтинг
print(data['rating'].value_counts())
#видим, что в рейтинге похоже стоит длительность фильма
TV-MA 3207 TV-14 2160 TV-PG 863 R 799 PG-13 490 TV-Y7 334 TV-Y 307 PG 287 TV-G 220 NR 80 G 41 TV-Y7-FV 6 NC-17 3 UR 3 74 min 1 84 min 1 66 min 1 Name: rating, dtype: int64
display(data.query('rating=="74 min"'))
display(data.query('rating=="84 min"'))
display(data.query('rating=="66 min"'))
#видим, что везде не хватает длительности и похоже действительно случайно данные поместили в столбец рейтинга
show_id | type | title | director | cast | country | date_added | release_year | rating | duration | listed_in | description | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
5541 | s5542 | Movie | Louis C.K. 2017 | Louis C.K. | Louis C.K. | United States | 2017-04-04 | 2017 | 74 min | NaN | Movies | Louis C.K. muses on religion, eternal love, gi... |
show_id | type | title | director | cast | country | date_added | release_year | rating | duration | listed_in | description | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
5794 | s5795 | Movie | Louis C.K.: Hilarious | Louis C.K. | Louis C.K. | United States | 2016-09-16 | 2010 | 84 min | NaN | Movies | Emmy-winning comedy writer Louis C.K. brings h... |
show_id | type | title | director | cast | country | date_added | release_year | rating | duration | listed_in | description | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
5813 | s5814 | Movie | Louis C.K.: Live at the Comedy Store | Louis C.K. | Louis C.K. | United States | 2016-08-15 | 2015 | 66 min | NaN | Movies | The comic puts his trademark hilarious/thought... |
#гуглим эти три картины и вписываем нужный результат
data.loc[data['rating'] == '74 min', 'rating'] = 'TV-MA'
data.loc[data['rating'] == '84 min', 'rating'] = 'TV-MA'
data.loc[data['rating'] == '66 min', 'rating'] = 'TV-MA'
print(len(data[data['director'].isnull()]))
print(len(data[data['cast'].isnull()]))
print(len(data[data['country'].isnull()]))
print(len(data[data['date_added'].isnull()]))
print(len(data[data['rating'].isnull()]))
print(len(data[data['duration'].isnull()]))
2634 825 831 10 4 3
#выведем пустые значения колонки ratings
data[data['rating'].isnull()]
show_id | type | title | director | cast | country | date_added | release_year | rating | duration | listed_in | description | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
5989 | s5990 | Movie | 13TH: A Conversation with Oprah Winfrey & Ava ... | NaN | Oprah Winfrey, Ava DuVernay | NaN | 2017-01-26 | 2017 | NaN | 37 min | Movies | Oprah Winfrey sits down with director Ava DuVe... |
6827 | s6828 | TV Show | Gargantia on the Verdurous Planet | NaN | Kaito Ishikawa, Hisako Kanemoto, Ai Kayano, Ka... | Japan | 2016-12-01 | 2013 | NaN | 1 Season | Anime Series, International TV Shows | After falling through a wormhole, a space-dwel... |
7312 | s7313 | TV Show | Little Lunch | NaN | Flynn Curry, Olivia Deeble, Madison Lu, Oisín ... | Australia | 2018-02-01 | 2015 | NaN | 1 Season | Kids' TV, TV Comedies | Adopting a child's perspective, this show take... |
7537 | s7538 | Movie | My Honor Was Loyalty | Alessandro Pepe | Leone Frisa, Paolo Vaccarino, Francesco Miglio... | Italy | 2017-03-01 | 2015 | NaN | 115 min | Dramas | Amid the chaos and horror of World War II, a c... |
#выведем пустые значения колонки duration
data[data['duration'].isnull()]
show_id | type | title | director | cast | country | date_added | release_year | rating | duration | listed_in | description | |
---|---|---|---|---|---|---|---|---|---|---|---|---|
5541 | s5542 | Movie | Louis C.K. 2017 | Louis C.K. | Louis C.K. | United States | 2017-04-04 | 2017 | TV-MA | NaN | Movies | Louis C.K. muses on religion, eternal love, gi... |
5794 | s5795 | Movie | Louis C.K.: Hilarious | Louis C.K. | Louis C.K. | United States | 2016-09-16 | 2010 | TV-MA | NaN | Movies | Emmy-winning comedy writer Louis C.K. brings h... |
5813 | s5814 | Movie | Louis C.K.: Live at the Comedy Store | Louis C.K. | Louis C.K. | United States | 2016-08-15 | 2015 | TV-MA | NaN | Movies | The comic puts his trademark hilarious/thought... |
#выведем гистограмму ratings
data['rating'].hist(bins=15, range=(2,12));
plt.xticks(rotation='vertical');
plt.show();
#добавим год(year)
data['year'] = pd.DatetimeIndex(data['date_added']).year
#посчитаем количество фильмов по годам
data_pivot = data.pivot_table(values='show_id', index=['year'], aggfunc={'count'})
#выведем
print(data_pivot);
count year 2008.0 2 2009.0 2 2010.0 1 2011.0 13 2012.0 3 2013.0 11 2014.0 24 2015.0 82 2016.0 429 2017.0 1188 2018.0 1649 2019.0 2016 2020.0 1879 2021.0 1498
#выведем через график
data_pivot.plot(title='Общее количество фильмов по годам', style='o-', grid=True);
#посчитаем (сериал и фильм)
data_sum = data.groupby('type').agg({'year': 'count'}).sort_values(by='year', ascending=False).reset_index()
data_sum.columns = ['Тип', 'Количество']
display(data_sum)
Тип | Количество | |
---|---|---|
0 | Movie | 6131 |
1 | TV Show | 2666 |
#вывод через круговую диаграмму
fig = go.Figure(data=[go.Pie(labels=data_sum['Тип'], values=data_sum['Количество'])])
fig.update_layout(title={'text':'Количество фильмов и сериалов в %'})
fig.show()
#посчитаем количество стране, где сняли менее 10 картин за год
data_country = data.groupby('country').agg({'year': 'count'}).sort_values(by='year', ascending=False).reset_index()
data_country.columns = ['Страна', 'Количество']
print(len(data_country.query('Количество < 10')))
690
#сохраним первые 25
data_country = data_country.head(25)
#выведем первые 25
display(data_country)
Страна | Количество | |
---|---|---|
0 | United States | 2812 |
1 | India | 972 |
2 | United Kingdom | 418 |
3 | Japan | 244 |
4 | South Korea | 199 |
5 | Canada | 181 |
6 | Spain | 145 |
7 | France | 124 |
8 | Mexico | 110 |
9 | Egypt | 106 |
10 | Turkey | 105 |
11 | Nigeria | 95 |
12 | Australia | 86 |
13 | Taiwan | 81 |
14 | Indonesia | 79 |
15 | Brazil | 77 |
16 | United Kingdom, United States | 75 |
17 | Philippines | 75 |
18 | United States, Canada | 73 |
19 | Germany | 67 |
20 | China | 66 |
21 | Thailand | 61 |
22 | Argentina | 56 |
23 | Hong Kong | 53 |
24 | United States, United Kingdom | 47 |
#выведем через график
plt.figure(figsize=(12,4))
plt.title('Количество фильмов по странам')
plt.xticks(rotation='vertical');
sns.barplot(data=data_country, x='Страна', y='Количество')
plt.show()