In [1]:
#подгрузим основные библиотеки
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import datetime as dt
from time import strptime
import math as mth
from plotly import graph_objects as go 
In [2]:
#загрузим файл
data = pd.read_excel('netflix_titles_2021.xlsx')
In [3]:
#выведем первые 5 значений
display(data.head())
show_id type title director cast country date_added release_year rating duration listed_in description
0 s1 Movie Dick Johnson Is Dead Kirsten Johnson NaN United States 09.25.2021 2020.0 PG-13 90 min Documentaries As her father nears the end of his life, filmm...
1 s2 TV Show Blood & Water NaN Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban... South Africa 09.24.2021 2021.0 TV-MA 2 Seasons International TV Shows, TV Dramas, TV Mysteries After crossing paths at a party, a Cape Town t...
2 s3 TV Show Ganglands Julien Leclercq Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi... NaN 09.24.2021 2021.0 TV-MA 1 Season Crime TV Shows, International TV Shows, TV Act... To protect his family from a powerful drug lor...
3 s4 TV Show Jailbirds New Orleans NaN NaN NaN 09.24.2021 2021.0 TV-MA 1 Season Docuseries, Reality TV Feuds, flirtations and toilet talk go down amo...
4 s5 TV Show Kota Factory NaN Mayur More, Jitendra Kumar, Ranjan Raj, Alam K... India 09.24.2021 2021.0 TV-MA 2 Seasons International TV Shows, Romantic TV Shows, TV ... In a city of coaching centers known to train I...
In [4]:
#рассмотрим подробнее
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   show_id       8807 non-null   object 
 1   type          8807 non-null   object 
 2   title         8807 non-null   object 
 3   director      6173 non-null   object 
 4   cast          7982 non-null   object 
 5   country       7976 non-null   object 
 6   date_added    8797 non-null   object 
 7   release_year  8807 non-null   float64
 8   rating        8803 non-null   object 
 9   duration      8804 non-null   object 
 10  listed_in     8807 non-null   object 
 11  description   8807 non-null   object 
dtypes: float64(1), object(11)
memory usage: 825.8+ KB
In [5]:
#изменим тим данных у дат
data['date_added'] = pd.to_datetime(data['date_added'], format='%m.%d.%Y')
data['release_year'] = data['release_year'].astype('int')
In [6]:
#проверим
data.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   show_id       8807 non-null   object        
 1   type          8807 non-null   object        
 2   title         8807 non-null   object        
 3   director      6173 non-null   object        
 4   cast          7982 non-null   object        
 5   country       7976 non-null   object        
 6   date_added    8797 non-null   datetime64[ns]
 7   release_year  8807 non-null   int32         
 8   rating        8803 non-null   object        
 9   duration      8804 non-null   object        
 10  listed_in     8807 non-null   object        
 11  description   8807 non-null   object        
dtypes: datetime64[ns](1), int32(1), object(10)
memory usage: 791.4+ KB
In [7]:
display(data.head())
show_id type title director cast country date_added release_year rating duration listed_in description
0 s1 Movie Dick Johnson Is Dead Kirsten Johnson NaN United States 2021-09-25 2020 PG-13 90 min Documentaries As her father nears the end of his life, filmm...
1 s2 TV Show Blood & Water NaN Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban... South Africa 2021-09-24 2021 TV-MA 2 Seasons International TV Shows, TV Dramas, TV Mysteries After crossing paths at a party, a Cape Town t...
2 s3 TV Show Ganglands Julien Leclercq Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi... NaN 2021-09-24 2021 TV-MA 1 Season Crime TV Shows, International TV Shows, TV Act... To protect his family from a powerful drug lor...
3 s4 TV Show Jailbirds New Orleans NaN NaN NaN 2021-09-24 2021 TV-MA 1 Season Docuseries, Reality TV Feuds, flirtations and toilet talk go down amo...
4 s5 TV Show Kota Factory NaN Mayur More, Jitendra Kumar, Ranjan Raj, Alam K... India 2021-09-24 2021 TV-MA 2 Seasons International TV Shows, Romantic TV Shows, TV ... In a city of coaching centers known to train I...
In [8]:
#рассмотрим колонку рейтинг
print(data['rating'].value_counts())
#видим, что в рейтинге похоже стоит длительность фильма
TV-MA       3207
TV-14       2160
TV-PG        863
R            799
PG-13        490
TV-Y7        334
TV-Y         307
PG           287
TV-G         220
NR            80
G             41
TV-Y7-FV       6
NC-17          3
UR             3
74 min         1
84 min         1
66 min         1
Name: rating, dtype: int64
In [9]:
display(data.query('rating=="74 min"'))
display(data.query('rating=="84 min"'))
display(data.query('rating=="66 min"'))
#видим, что везде не хватает длительности и похоже действительно случайно данные поместили в столбец рейтинга
show_id type title director cast country date_added release_year rating duration listed_in description
5541 s5542 Movie Louis C.K. 2017 Louis C.K. Louis C.K. United States 2017-04-04 2017 74 min NaN Movies Louis C.K. muses on religion, eternal love, gi...
show_id type title director cast country date_added release_year rating duration listed_in description
5794 s5795 Movie Louis C.K.: Hilarious Louis C.K. Louis C.K. United States 2016-09-16 2010 84 min NaN Movies Emmy-winning comedy writer Louis C.K. brings h...
show_id type title director cast country date_added release_year rating duration listed_in description
5813 s5814 Movie Louis C.K.: Live at the Comedy Store Louis C.K. Louis C.K. United States 2016-08-15 2015 66 min NaN Movies The comic puts his trademark hilarious/thought...
In [10]:
#гуглим эти три картины и вписываем нужный результат
data.loc[data['rating'] == '74 min', 'rating'] = 'TV-MA'
data.loc[data['rating'] == '84 min', 'rating'] = 'TV-MA'
data.loc[data['rating'] == '66 min', 'rating'] = 'TV-MA'
In [11]:
print(len(data[data['director'].isnull()]))
print(len(data[data['cast'].isnull()]))
print(len(data[data['country'].isnull()]))
print(len(data[data['date_added'].isnull()]))
print(len(data[data['rating'].isnull()]))
print(len(data[data['duration'].isnull()]))
2634
825
831
10
4
3
In [12]:
#выведем пустые значения колонки ratings
data[data['rating'].isnull()]
Out[12]:
show_id type title director cast country date_added release_year rating duration listed_in description
5989 s5990 Movie 13TH: A Conversation with Oprah Winfrey & Ava ... NaN Oprah Winfrey, Ava DuVernay NaN 2017-01-26 2017 NaN 37 min Movies Oprah Winfrey sits down with director Ava DuVe...
6827 s6828 TV Show Gargantia on the Verdurous Planet NaN Kaito Ishikawa, Hisako Kanemoto, Ai Kayano, Ka... Japan 2016-12-01 2013 NaN 1 Season Anime Series, International TV Shows After falling through a wormhole, a space-dwel...
7312 s7313 TV Show Little Lunch NaN Flynn Curry, Olivia Deeble, Madison Lu, Oisín ... Australia 2018-02-01 2015 NaN 1 Season Kids' TV, TV Comedies Adopting a child's perspective, this show take...
7537 s7538 Movie My Honor Was Loyalty Alessandro Pepe Leone Frisa, Paolo Vaccarino, Francesco Miglio... Italy 2017-03-01 2015 NaN 115 min Dramas Amid the chaos and horror of World War II, a c...
In [13]:
#выведем пустые значения колонки duration
data[data['duration'].isnull()]
Out[13]:
show_id type title director cast country date_added release_year rating duration listed_in description
5541 s5542 Movie Louis C.K. 2017 Louis C.K. Louis C.K. United States 2017-04-04 2017 TV-MA NaN Movies Louis C.K. muses on religion, eternal love, gi...
5794 s5795 Movie Louis C.K.: Hilarious Louis C.K. Louis C.K. United States 2016-09-16 2010 TV-MA NaN Movies Emmy-winning comedy writer Louis C.K. brings h...
5813 s5814 Movie Louis C.K.: Live at the Comedy Store Louis C.K. Louis C.K. United States 2016-08-15 2015 TV-MA NaN Movies The comic puts his trademark hilarious/thought...
In [14]:
#выведем гистограмму ratings
data['rating'].hist(bins=15, range=(2,12));
plt.xticks(rotation='vertical');
plt.show();
In [15]:
#добавим год(year)
data['year'] = pd.DatetimeIndex(data['date_added']).year 
In [16]:
#посчитаем количество фильмов по годам
data_pivot = data.pivot_table(values='show_id', index=['year'], aggfunc={'count'})
In [17]:
#выведем
print(data_pivot);
        count
year         
2008.0      2
2009.0      2
2010.0      1
2011.0     13
2012.0      3
2013.0     11
2014.0     24
2015.0     82
2016.0    429
2017.0   1188
2018.0   1649
2019.0   2016
2020.0   1879
2021.0   1498
In [18]:
#выведем через график
data_pivot.plot(title='Общее количество фильмов по годам', style='o-', grid=True);
In [19]:
#посчитаем (сериал и фильм)
data_sum = data.groupby('type').agg({'year': 'count'}).sort_values(by='year', ascending=False).reset_index()
data_sum.columns = ['Тип', 'Количество']
display(data_sum)
Тип Количество
0 Movie 6131
1 TV Show 2666
In [20]:
#вывод через круговую диаграмму
fig = go.Figure(data=[go.Pie(labels=data_sum['Тип'], values=data_sum['Количество'])])
fig.update_layout(title={'text':'Количество фильмов и сериалов в %'})
fig.show()
In [21]:
#посчитаем количество стране, где сняли менее 10 картин за год
data_country = data.groupby('country').agg({'year': 'count'}).sort_values(by='year', ascending=False).reset_index()
data_country.columns = ['Страна', 'Количество']
print(len(data_country.query('Количество < 10')))
690
In [22]:
#сохраним первые 25
data_country = data_country.head(25)
In [23]:
#выведем первые 25
display(data_country)
Страна Количество
0 United States 2812
1 India 972
2 United Kingdom 418
3 Japan 244
4 South Korea 199
5 Canada 181
6 Spain 145
7 France 124
8 Mexico 110
9 Egypt 106
10 Turkey 105
11 Nigeria 95
12 Australia 86
13 Taiwan 81
14 Indonesia 79
15 Brazil 77
16 United Kingdom, United States 75
17 Philippines 75
18 United States, Canada 73
19 Germany 67
20 China 66
21 Thailand 61
22 Argentina 56
23 Hong Kong 53
24 United States, United Kingdom 47
In [24]:
#выведем через график
plt.figure(figsize=(12,4))
plt.title('Количество фильмов по странам')
plt.xticks(rotation='vertical');
sns.barplot(data=data_country, x='Страна', y='Количество')
plt.show()