import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Read the accident records into a DataFrame
df = pd.read_csv("US_Accidents_Dec21.csv")
# Count null entries per column and keep only the columns that have any
null_counts = df.isna().sum()
missing_data = null_counts[null_counts > 0]
# Bar chart of the null counts, largest first
fig, ax = plt.subplots(figsize=(12, 6))
missing_data.sort_values(ascending=False).plot.bar(ax=ax)
ax.set_title("Missing Data")
ax.set_xlabel("Columns")
ax.set_ylabel("Missing Values Count")
plt.show()
# Remove columns not useful for analysis.
# errors="ignore" skips any listed name that is not present in the loaded CSV
# instead of raising KeyError and aborting the script.
# NOTE(review): "TMC" appears to be absent from the Dec21 release - confirm.
columns_to_drop = ["TMC", "End_Lat", "End_Lng", "Description", "Number", "Wind_Chill(F)", "Precipitation(in)"]
df = df.drop(columns=columns_to_drop, errors="ignore")
# Bar chart of the ten cities that recorded the most accidents
city_counts = df['City'].value_counts()
top_10_cities = city_counts.nlargest(10)
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=top_10_cities.index, y=top_10_cities.values, palette="viridis", ax=ax)
ax.set_title("Top 10 Cities with the Most Accidents")
ax.set_xlabel("City")
ax.set_ylabel("Number of Accidents")
# City names are long, so tilt the tick labels
plt.xticks(rotation=45)
plt.show()
# Plot distribution of accident start times by hour of the day
# Parse the column once here; the later sections rely on it being datetime.
df['Start_Time'] = pd.to_datetime(df['Start_Time'])
# The x-axis is labelled "Hour of the Day", so histogram the hour component.
# Binning the raw timestamps would instead cut the whole date range of the
# dataset into 24 slices.
start_hour = df['Start_Time'].dt.hour
plt.figure(figsize=(12, 6))
# discrete=True gives one unit-width bar centred on each integer hour
sns.histplot(x=start_hour, discrete=True, kde=True)
plt.title("Distribution of Start Time")
plt.xlabel("Hour of the Day")
plt.ylabel("Number of Accidents")
plt.show()
# Share of accidents per hour of day, computed separately for Mon-Fri and Sat-Sun
df['Weekday'] = df['Start_Time'].dt.weekday
on_weekday = df['Weekday'] < 5
on_weekend = df['Weekday'] >= 5
weekday_hourly_distribution = df.loc[on_weekday, 'Start_Time'].dt.hour.value_counts(normalize=True)
weekend_hourly_distribution = df.loc[on_weekend, 'Start_Time'].dt.hour.value_counts(normalize=True)
# Draw both hourly profiles on the same axes for comparison
fig, ax = plt.subplots(figsize=(12, 6))
for series_label, hourly_share in (
    ('Weekday', weekday_hourly_distribution),
    ('Weekend', weekend_hourly_distribution),
):
    sns.lineplot(x=hourly_share.index, y=hourly_share.values, label=series_label, ax=ax)
ax.set_title("Distribution of Accidents by Hour (Weekdays vs. Weekends)")
ax.set_xlabel("Hour of the Day")
ax.set_ylabel("Proportion of Accidents")
ax.legend()
plt.show()
# Plot distribution of accident start hours on Sundays
# dt.weekday numbers Monday as 0, so Sunday is 6.
sunday_start_time = df[df['Start_Time'].dt.weekday == 6]['Start_Time']
plt.figure(figsize=(12, 6))
# sunday_start_time is a Series, so there is no 'Start_Time' key for seaborn to
# look up in `data`; pass the values directly as x. The hour component is
# plotted because the axis is labelled "Hour of the Day".
sns.histplot(x=sunday_start_time.dt.hour, discrete=True, kde=True)
plt.title("Distribution of Start Time on Sundays")
plt.xlabel("Hour of the Day")
plt.ylabel("Number of Accidents")
plt.show()
# Scatter every accident at its starting coordinates, coloured by state
fig, ax = plt.subplots(figsize=(12, 8))
sns.scatterplot(x='Start_Lng', y='Start_Lat', hue='State', alpha=0.2, data=df, ax=ax)
ax.set(
    title="Accidents Across the United States",
    xlabel="Longitude",
    ylabel="Latitude",
)
plt.show()
# Take the 100 cities with the most accidents, then count their accidents per state
top_100_cities = df['City'].value_counts().nlargest(100).index
in_top_city = df['City'].isin(top_100_cities)
top_100_states = df.loc[in_top_city, 'State'].value_counts()
# Bar chart of those per-state counts
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=top_100_states.index, y=top_100_states.values, palette="viridis", ax=ax)
ax.set_title("States with the Most Accidents among Top 100 Cities")
ax.set_xlabel("State")
ax.set_ylabel("Number of Accidents")
plt.xticks(rotation=45)
plt.show()
# Count accidents per calendar month (1-12)
start_times = pd.to_datetime(df['Start_Time'])
df['Start_Time'] = start_times
df['Month'] = df['Start_Time'].dt.month
monthly_accidents = df['Month'].value_counts()
# Bar chart of the monthly counts
fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x=monthly_accidents.index, y=monthly_accidents.values, palette="viridis", ax=ax)
ax.set(
    title="Months with the Most Accidents",
    xlabel="Month",
    ylabel="Number of Accidents",
)
plt.show()
# Count accidents per calendar year, ordered chronologically
df['Year'] = df['Start_Time'].dt.year
year_counts = df['Year'].value_counts()
yearly_accidents = year_counts.sort_index()
# Line chart of the yearly totals
fig, ax = plt.subplots(figsize=(12, 6))
sns.lineplot(x=yearly_accidents.index, y=yearly_accidents.values, ax=ax)
ax.set_title("Trend of Accidents Year over Year")
ax.set_xlabel("Year")
ax.set_ylabel("Number of Accidents")
plt.show()
# Report whether any record carries the state code 'NY'
state_is_ny = df['State'] == 'NY'
is_new_york_present = bool(state_is_ny.any())
print("Is New York present in the data?", is_new_york_present)
# Your own question: What is the average duration of accidents?
# End_Time is never parsed anywhere above, so it is still text as loaded from
# the CSV; subtracting a datetime column from it would raise TypeError.
# Parse both ends explicitly (a no-op for Start_Time if it is already datetime).
end_time = pd.to_datetime(df['End_Time'])
start_time = pd.to_datetime(df['Start_Time'])
df['Duration'] = end_time - start_time
# Mean of a timedelta column is itself a Timedelta
average_duration = df['Duration'].mean()
print("Average Duration of Accidents:", average_duration)