import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
Data Analysis Challenge 06 - Roller Coaster Analysis
= "https://raw.githubusercontent.com/kedeisha1/Challenges/main/coaster_db.csv"
url= pd.read_csv(url) df
10) df.head(
coaster_name | Length | Speed | Location | Status | Opening date | Type | Manufacturer | Height restriction | Model | ... | speed1 | speed2 | speed1_value | speed1_unit | speed_mph | height_value | height_unit | height_ft | Inversions_clean | Gforce_clean | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | Switchback Railway | 600 ft (180 m) | 6 mph (9.7 km/h) | Coney Island | Removed | June 16, 1884 | Wood | LaMarcus Adna Thompson | NaN | Lift Packed | ... | 6 mph | 9.7 km/h | 6.0 | mph | 6.0 | 50.0 | ft | NaN | 0 | 2.9 |
1 | Flip Flap Railway | NaN | NaN | Sea Lion Park | Removed | 1895 | Wood | Lina Beecher | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1 | 12.0 |
2 | Switchback Railway (Euclid Beach Park) | NaN | NaN | Cleveland, Ohio, United States | Closed | NaN | Other | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0 | NaN |
3 | Loop the Loop (Coney Island) | NaN | NaN | Other | Removed | 1901 | Steel | Edwin Prescott | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1 | NaN |
4 | Loop the Loop (Young's Pier) | NaN | NaN | Other | Removed | 1901 | Steel | Edwin Prescott | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 1 | NaN |
5 | Cannon Coaster | NaN | NaN | Coney Island | Removed | 1902 | Wood | George Francis Meyer | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | 40.0 | ft | NaN | 0 | NaN |
6 | Leap-The-Dips | 1,452 ft (443 m) | 10 mph (16 km/h) | Lakemont Park | Operating | 1902 | Wood – Side friction | Federal Construction Company | NaN | NaN | ... | 10 mph | 16 km/h | 10.0 | mph | 10.0 | 41.0 | ft | NaN | 0 | NaN |
7 | Figure Eight (Euclid Beach Park) | NaN | NaN | Cleveland, Ohio, United States | Closed | NaN | Other | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0 | NaN |
8 | Drop the Dip | NaN | NaN | Coney Island | Removed | June 6, 1907 | Other | Arthur Jarvis | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | 60.0 | ft | NaN | 0 | NaN |
9 | Scenic Railway (Euclid Beach Park) | NaN | NaN | Cleveland, Ohio, United States | Closed | NaN | Other | NaN | NaN | NaN | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | 0 | NaN |
10 rows × 56 columns
1. How many columns and rows are in the dataset?
= df.shape
shape print(f"Dataframe has {shape[0]} rows and {shape[1]} columns")
Dataframe has 1087 rows and 56 columns
2. Is there any missing data?
df.isna()
coaster_name | Length | Speed | Location | Status | Opening date | Type | Manufacturer | Height restriction | Model | ... | speed1 | speed2 | speed1_value | speed1_unit | speed_mph | height_value | height_unit | height_ft | Inversions_clean | Gforce_clean | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | False | False | False | False | False | False | False | False | True | False | ... | False | False | False | False | False | False | False | True | False | False |
1 | False | True | True | False | False | False | False | False | True | True | ... | True | True | True | True | True | True | True | True | False | False |
2 | False | True | True | False | False | True | False | True | True | True | ... | True | True | True | True | True | True | True | True | False | True |
3 | False | True | True | False | False | False | False | False | True | True | ... | True | True | True | True | True | True | True | True | False | True |
4 | False | True | True | False | False | False | False | False | True | True | ... | True | True | True | True | True | True | True | True | False | True |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
1082 | False | False | False | False | True | True | False | False | False | True | ... | False | False | False | False | False | False | False | True | False | False |
1083 | False | False | False | False | False | False | False | False | True | False | ... | False | False | False | False | False | False | False | True | False | True |
1084 | False | False | False | False | True | False | False | False | False | False | ... | False | False | False | False | False | False | False | True | False | False |
1085 | False | False | False | False | False | True | False | False | True | False | ... | False | False | False | False | False | False | False | True | False | True |
1086 | False | False | False | False | False | False | False | False | True | False | ... | False | False | False | False | False | False | False | True | False | True |
1087 rows × 56 columns
3. Display the summary statistics of the numeric columns using the describe method.
df.describe()
Inversions | year_introduced | latitude | longitude | speed1_value | speed_mph | height_value | height_ft | Inversions_clean | Gforce_clean | |
---|---|---|---|---|---|---|---|---|---|---|
count | 932.000000 | 1087.000000 | 812.000000 | 812.000000 | 937.000000 | 937.000000 | 965.000000 | 171.000000 | 1087.000000 | 362.000000 |
mean | 1.547210 | 1994.986201 | 38.373484 | -41.595373 | 53.850374 | 48.617289 | 89.575171 | 101.996491 | 1.326587 | 3.824006 |
std | 2.114073 | 23.475248 | 15.516596 | 72.285227 | 23.385518 | 16.678031 | 136.246444 | 67.329092 | 2.030854 | 0.989998 |
min | 0.000000 | 1884.000000 | -48.261700 | -123.035700 | 5.000000 | 5.000000 | 4.000000 | 13.100000 | 0.000000 | 0.800000 |
25% | 0.000000 | 1989.000000 | 35.031050 | -84.552200 | 40.000000 | 37.300000 | 44.000000 | 51.800000 | 0.000000 | 3.400000 |
50% | 0.000000 | 2000.000000 | 40.289800 | -76.653600 | 50.000000 | 49.700000 | 79.000000 | 91.200000 | 0.000000 | 4.000000 |
75% | 3.000000 | 2010.000000 | 44.799600 | 2.778100 | 63.000000 | 58.000000 | 113.000000 | 131.200000 | 2.000000 | 4.500000 |
max | 14.000000 | 2022.000000 | 63.230900 | 153.426500 | 240.000000 | 149.100000 | 3937.000000 | 377.300000 | 14.000000 | 12.000000 |
4. Rename the following columns:
- coaster_name -> Coaster_Name
- year_introduced -> Year_Introduced
- opening_date_clean -> Opening_Date
- speed_mph -> Speed_mph
- height_ft -> Height_ft
- Inversions_clean -> Inversions
- Gforce_clean -> Gforce
= {"coaster_name":"Coaster_Name",
new_column_names "year_introduced":"Year_Introduced",
"opening_date_clean":"Opening_Date",
"speed_mph":"Speed_mph",
"height_ft":"Height_ft",
"Inversions_clean":"Inversions",
"Gforce_clean":"Gforce"
}=new_column_names, inplace=True) df.rename(columns
df.columns
Index(['Coaster_Name', 'Length', 'Speed', 'Location', 'Status', 'Opening date',
'Type', 'Manufacturer', 'Height restriction', 'Model', 'Height',
'Inversions', 'Lift/launch system', 'Cost', 'Trains', 'Park section',
'Duration', 'Capacity', 'G-force', 'Designer', 'Max vertical angle',
'Drop', 'Soft opening date', 'Fast Lane available', 'Replaced',
'Track layout', 'Fastrack available', 'Soft opening date.1',
'Closing date', 'Opened', 'Replaced by', 'Website',
'Flash Pass Available', 'Must transfer from wheelchair', 'Theme',
'Single rider line available', 'Restraint Style',
'Flash Pass available', 'Acceleration', 'Restraints', 'Name',
'Year_Introduced', 'latitude', 'longitude', 'Type_Main', 'Opening_Date',
'speed1', 'speed2', 'speed1_value', 'speed1_unit', 'Speed_mph',
'height_value', 'height_unit', 'Height_ft', 'Inversions', 'Gforce'],
dtype='object')
5. Are there any duplicated rows?
== True] df[df.duplicated()
Coaster_Name | Length | Speed | Location | Status | Opening date | Type | Manufacturer | Height restriction | Model | ... | speed1 | speed2 | speed1_value | speed1_unit | Speed_mph | height_value | height_unit | Height_ft | Inversions | Gforce |
---|
0 rows × 56 columns
6. What are the top 3 years with the most roller coasters introduced?
"Year_Introduced"].value_counts(ascending=False).iloc[0:3] df[
1999 49
2000 47
1998 32
Name: Year_Introduced, dtype: int64
7. What is the average speed? Also display a plot to show it’s distribution.
= df["Speed"]
speed = speed.str.extract('(\d+\.?\d+)\s?km/h',expand=False).astype("float64").dropna()
speed = np.mean(np.array(speed), dtype=float)
speed_in_km_average print(f"The average speed in km/h is {np.round(speed_in_km_average,2)}")
The average speed in km/h is 78.13
= plt.subplots(figsize=(10, 5))
fig, ax
= sns.histplot(speed)
ax "Distribution of Roller Coasters speed in km/h")
ax.set_title("Roller Coasters")
ax.set_xlabel("Speed frequencies")
ax.set_ylabel( plt.show()
9. Create your own question and answer it. Find out which roller coasters are still opened and calculate number of years since opening and visulize that.
## Filter only opend roller coasters
= df[df["Status"] == "Operating"]
open_roller_coasters = datetime.now().year
currentYear = currentYear - open_roller_coasters["Year_Introduced"]
years_opened years_opened
6 121
13 113
15 111
22 103
23 103
...
1066 2
1068 2
1069 2
1070 2
1071 2
Name: Year_Introduced, Length: 668, dtype: int64
= plt.subplots(figsize=(10, 5))
fig, ax = sns.histplot(years_opened)
ax "Years opened")
ax.set_xlabel("Frequency")
ax.set_ylabel("Distribution of years opened for still operating roller coasters")
ax.set_title( plt.show()