Data Analysis Workout 09: Netflix Analysis

Level of Difficulty:

Objective: This workout provides practice in exploratory data analysis on Netflix data.

Download the dataset here.

Challenge Questions:

  1. Is there any missing data? Deal with them accordingly.
  2. Using the ‘date_added’ column a new column called ‘year_added’ that only has the year the title was added.
  3. Using the ‘date_added’ column a new column called ‘month_added’ that only has the month the title was added.
  4. Check the data types. Anything look odd? Adjust accordingly.
  5. What is the most popular release year for movies on Netflix?
  6. What year did Netflix add the most content to its platform?
  7. What is the movie with the longest title in the dataset?
  8. What are the top 5 most popular movie genres?
  9. Create a pie chart visualizing the proportion of movies vs TV shows. Label each section with the percentage.

Simply post your code and a screenshot of your results.

Please format your Python code and blur it or place it in a hidden section.

This workout will be released on Monday May 23 2023, and the author’s solution will be posted on Sunday May 28, 2023.

1 Like

Hi @kedeisha1 ,

Thank you for the workout.
I couldn’t download the source file.
Will you help?

Right click anywhere and hit “Save As”. It saves as a .csv

This is not the best solution probably. Would be thankful for your comments. And if possible, please add the solution, as it is already 30.05.2023 :blush:

from google.colab import files
uploaded = files.upload()

import pandas as pd
import io
import matplotlib.pyplot as plt
from prettytable import PrettyTable

# Read the uploaded file into a DataFrame
df = pd.read_csv(io.BytesIO(uploaded['netflix_titles.txt']))

# Check for missing data and deal with them
print(df.isnull().sum())

1

# Convert 'date_added' to datetime type
df['date_added'] = pd.to_datetime(df['date_added'])

# Create the new columns
df['year_added'] = df['date_added'].dt.year
df['month_added'] = df['date_added'].dt.month
print(df.dtypes)

2

# Fill missing values with -1 (indicating missing data)
df['year_added'].fillna(-1, inplace=True)
df['month_added'].fillna(-1, inplace=True)

# Convert columns to integers
df['year_added'] = df['year_added'].astype(int)
df['month_added'] = df['month_added'].astype(int)

# What is the most popular release year for movies on Netflix?
# What year did Netflix add the most content to its platform?
# What is the movie with the longest title in the dataset?
popular_release_year = df[df['type'] == 'Movie']['release_year'].value_counts().idxmax()
most_content_year = df['year_added'].value_counts().idxmax()
longest_title_movie = df[df['type'] == 'Movie']['title'].str.len().idxmax()

# Specify the Column Names while initializing the Table
summary_table = PrettyTable(["Information", "Result"])

# Add rows
summary_table.add_row(["Most popular release year for movies", popular_release_year])
summary_table.add_row(["Year when Netflix added the most content", most_content_year])
summary_table.add_row(["Movie with the longest title", df.loc[longest_title_movie, 'title']])
print(summary_table)

# What are the top 5 most popular movie genres?
# Create a DataFrame of the top 5 most popular movie genres.
genres_df = pd.DataFrame(df[df['type'] == 'Movie']['listed_in'].str.split(', ', expand=True).stack().value_counts().nlargest(5), columns=['Count'])
genres_df = genres_df.reset_index()
genres_df.rename(columns = {'index':'Genre'}, inplace = True)

# Initialize the PrettyTable
genres_table = PrettyTable(["Genre", "Count"])

# Add rows to the table using a for loop
for index, value in genres_df.iterrows():
     genres_table.add_row([value['Genre'], value['Count']])

print("The top 5 most popular movie genres are:")
print(genres_table)

4

# Create a pie chart visualizing the proportion of movies vs TV shows. Label each section with the percentage.
content_type_counts = df['type'].value_counts()
plt.pie(content_type_counts, labels = content_type_counts.index, autopct='%1.1f%%')
plt.title('Proportion of Movies vs TV Shows')
plt.show()

5

Data_Analysis_Workout_09_Netflix_Analysis.ipynb (42.7 KB)

NETFLIX DATA.ipynb (57.9 KB)


[NETFLIX DATA-checkpoint.ipynb|attachment](upload://v0D51Ll2MNbBNuxgBXYV5OHin8U.ipynb) (57.2 KB)

import pandas as pd

# Load the dataset
url = 'https://raw.githubusercontent.com/kedeisha1/Challenges/main/netflix_titles.csv'
df = pd.read_csv(url)

# Check for missing data
missing_data = df.isnull().sum()
print(missing_data)

# Create 'year_added' column
df['year_added'] = pd.to_datetime(df['date_added']).dt.year

# Create 'month_added' column
df['month_added'] = pd.to_datetime(df['date_added']).dt.month

# Check data types
print(df.dtypes)

# Filter movies only
movies_df = df[df['type'] == 'Movie']

# Count occurrences of each release year
popular_release_year = movies_df['release_year'].value_counts().idxmax()

print("The most popular release year for movies on Netflix is:", popular_release_year)

# Count occurrences of each year_added
most_content_year = df['year_added'].value_counts().idxmax()

print("The year when Netflix added the most content is:", most_content_year)

# Find the movie with the longest title
longest_title_movie = df.loc[df[df['type'] == 'Movie']['title'].str.len().idxmax()]['title']

print("The movie with the longest title is:", longest_title_movie)

# Split the genres column and count occurrences
genres_counts = df['listed_in'].str.split(', ').explode().value_counts()

top_5_genres = genres_counts.head(5)

print("Top 5 most popular movie genres:")
print(top_5_genres)

import matplotlib.pyplot as plt

# Count the occurrences of movies and TV shows
type_counts = df['type'].value_counts()

# Create the pie chart
plt.pie(type_counts, labels=type_counts.index, autopct='%1.1f%%')
plt.axis('equal')
plt.title('Proportion of Movies vs TV Shows')
plt.show()
Nice workout! I tried a few different visuals and decided to go with a lollipop chart for the fun of it:


[details="Summary"]
import pandas as pd

import numpy as np

import matplotlib.pyplot as plt

# load the dataset

url = 'https://raw.githubusercontent.com/kedeisha1/Challenges/main/netflix_titles.csv'

df = pd.read_csv(url)

print(df)

df.head()

# %%

df.info()

df.isna().sum()

# %%

df = df.dropna()

df.isna().sum()

# %%

df.info()

# %%

df['date_added'] = pd.to_datetime(df['date_added'])

df['release_year'] = pd.to_datetime(df['release_year'], format='%Y').dt.year

# %%

df.dtypes

# %%

df['month_added'] = df['date_added'].dt.month_name()

df.head()

# %%

release_year_ct = df.groupby('release_year')['release_year'].count().sort_values(ascending=False)

release_year_ct

# %%

df['year_added'] = df['date_added'].dt.year

df.head()

# %%

year_added_ct = df.groupby('year_added')['year_added'].count().sort_values(ascending=False)

year_added_ct

# %%

movie_title_len = df.loc[df['type'] == 'Movie', 'title_len'] = df.loc[df['type'] == 'Movie', 'title'].str.len()

movie_title_len = df.loc[df['type'] == 'Movie', ['title', 'title_len']]

movie_title_len.sort_values(by='title_len', ascending=False)

# %%

genre_split = df['listed_in'].apply(lambda x: str(x).split(', ') if isinstance(x, str) else [])

genre_split_series = pd.Series([item for sublist in genre_split for item in sublist])

genre_count = genre_split_series.value_counts()

print(genre_count)

# %%

top_genres = genre_count.head(5)

plt.stem(range(len(top_genres)), genre_count.head(5))

plt.xticks(range(len(top_genres)), top_genres.index, rotation = 45)

plt.title("Netflix's top 5 moive genres")

plt.ylabel("Movie Count")

for i in range(len(genre_count.head(5))):

plt.text(i, genre_count[i], genre_count[i], ha='center', va='bottom')

plt.show()
[/details]