Data Analysis Challenge 06 - Roller Coaster Analysis

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime

url= "https://raw.githubusercontent.com/kedeisha1/Challenges/main/coaster_db.csv"
df = pd.read_csv(url)

df.head(10)

	coaster_name	Length	Speed	Location	Status	Opening date	Type	Manufacturer	Height restriction	Model	...	speed1	speed2	speed1_value	speed1_unit	speed_mph	height_value	height_unit	height_ft	Inversions_clean	Gforce_clean
0	Switchback Railway	600 ft (180 m)	6 mph (9.7 km/h)	Coney Island	Removed	June 16, 1884	Wood	LaMarcus Adna Thompson	NaN	Lift Packed	...	6 mph	9.7 km/h	6.0	mph	6.0	50.0	ft	NaN	0	2.9
1	Flip Flap Railway	NaN	NaN	Sea Lion Park	Removed	1895	Wood	Lina Beecher	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	1	12.0
2	Switchback Railway (Euclid Beach Park)	NaN	NaN	Cleveland, Ohio, United States	Closed	NaN	Other	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0	NaN
3	Loop the Loop (Coney Island)	NaN	NaN	Other	Removed	1901	Steel	Edwin Prescott	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	1	NaN
4	Loop the Loop (Young's Pier)	NaN	NaN	Other	Removed	1901	Steel	Edwin Prescott	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	1	NaN
5	Cannon Coaster	NaN	NaN	Coney Island	Removed	1902	Wood	George Francis Meyer	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	40.0	ft	NaN	0	NaN
6	Leap-The-Dips	1,452 ft (443 m)	10 mph (16 km/h)	Lakemont Park	Operating	1902	Wood – Side friction	Federal Construction Company	NaN	NaN	...	10 mph	16 km/h	10.0	mph	10.0	41.0	ft	NaN	0	NaN
7	Figure Eight (Euclid Beach Park)	NaN	NaN	Cleveland, Ohio, United States	Closed	NaN	Other	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0	NaN
8	Drop the Dip	NaN	NaN	Coney Island	Removed	June 6, 1907	Other	Arthur Jarvis	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	60.0	ft	NaN	0	NaN
9	Scenic Railway (Euclid Beach Park)	NaN	NaN	Cleveland, Ohio, United States	Closed	NaN	Other	NaN	NaN	NaN	...	NaN	NaN	NaN	NaN	NaN	NaN	NaN	NaN	0	NaN

10 rows × 56 columns

1. How many columns and rows are in the dataset?

shape = df.shape
print(f"Dataframe has {shape[0]} rows and {shape[1]} columns")

Dataframe has 1087 rows and 56 columns

2. Is there any missing data?

df.isna()

	coaster_name	Length	Speed	Location	Status	Opening date	Type	Manufacturer	Height restriction	Model	...	speed1	speed2	speed1_value	speed1_unit	speed_mph	height_value	height_unit	height_ft	Inversions_clean	Gforce_clean
0	False	False	False	False	False	False	False	False	True	False	...	False	False	False	False	False	False	False	True	False	False
1	False	True	True	False	False	False	False	False	True	True	...	True	True	True	True	True	True	True	True	False	False
2	False	True	True	False	False	True	False	True	True	True	...	True	True	True	True	True	True	True	True	False	True
3	False	True	True	False	False	False	False	False	True	True	...	True	True	True	True	True	True	True	True	False	True
4	False	True	True	False	False	False	False	False	True	True	...	True	True	True	True	True	True	True	True	False	True
...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...	...
1082	False	False	False	False	True	True	False	False	False	True	...	False	False	False	False	False	False	False	True	False	False
1083	False	False	False	False	False	False	False	False	True	False	...	False	False	False	False	False	False	False	True	False	True
1084	False	False	False	False	True	False	False	False	False	False	...	False	False	False	False	False	False	False	True	False	False
1085	False	False	False	False	False	True	False	False	True	False	...	False	False	False	False	False	False	False	True	False	True
1086	False	False	False	False	False	False	False	False	True	False	...	False	False	False	False	False	False	False	True	False	True

1087 rows × 56 columns

3. Display the summary statistics of the numeric columns using the describe method.

df.describe()

	Inversions	year_introduced	latitude	longitude	speed1_value	speed_mph	height_value	height_ft	Inversions_clean	Gforce_clean
count	932.000000	1087.000000	812.000000	812.000000	937.000000	937.000000	965.000000	171.000000	1087.000000	362.000000
mean	1.547210	1994.986201	38.373484	-41.595373	53.850374	48.617289	89.575171	101.996491	1.326587	3.824006
std	2.114073	23.475248	15.516596	72.285227	23.385518	16.678031	136.246444	67.329092	2.030854	0.989998
min	0.000000	1884.000000	-48.261700	-123.035700	5.000000	5.000000	4.000000	13.100000	0.000000	0.800000
25%	0.000000	1989.000000	35.031050	-84.552200	40.000000	37.300000	44.000000	51.800000	0.000000	3.400000
50%	0.000000	2000.000000	40.289800	-76.653600	50.000000	49.700000	79.000000	91.200000	0.000000	4.000000
75%	3.000000	2010.000000	44.799600	2.778100	63.000000	58.000000	113.000000	131.200000	2.000000	4.500000
max	14.000000	2022.000000	63.230900	153.426500	240.000000	149.100000	3937.000000	377.300000	14.000000	12.000000

4. Rename the following columns:

coaster_name -> Coaster_Name
year_introduced -> Year_Introduced
opening_date_clean -> Opening_Date
speed_mph -> Speed_mph
height_ft -> Height_ft
Inversions_clean -> Inversions
Gforce_clean -> Gforce

new_column_names = {"coaster_name":"Coaster_Name", 
 "year_introduced":"Year_Introduced",
 "opening_date_clean":"Opening_Date", 
 "speed_mph":"Speed_mph",
 "height_ft":"Height_ft", 
 "Inversions_clean":"Inversions",
 "Gforce_clean":"Gforce"
}
df.rename(columns =new_column_names, inplace=True)

df.columns

Index(['Coaster_Name', 'Length', 'Speed', 'Location', 'Status', 'Opening date',
       'Type', 'Manufacturer', 'Height restriction', 'Model', 'Height',
       'Inversions', 'Lift/launch system', 'Cost', 'Trains', 'Park section',
       'Duration', 'Capacity', 'G-force', 'Designer', 'Max vertical angle',
       'Drop', 'Soft opening date', 'Fast Lane available', 'Replaced',
       'Track layout', 'Fastrack available', 'Soft opening date.1',
       'Closing date', 'Opened', 'Replaced by', 'Website',
       'Flash Pass Available', 'Must transfer from wheelchair', 'Theme',
       'Single rider line available', 'Restraint Style',
       'Flash Pass available', 'Acceleration', 'Restraints', 'Name',
       'Year_Introduced', 'latitude', 'longitude', 'Type_Main', 'Opening_Date',
       'speed1', 'speed2', 'speed1_value', 'speed1_unit', 'Speed_mph',
       'height_value', 'height_unit', 'Height_ft', 'Inversions', 'Gforce'],
      dtype='object')

5. Are there any duplicated rows?

df[df.duplicated() == True]

	Coaster_Name	Length	Speed	Location	Status	Opening date	Type	Manufacturer	Height restriction	Model	...	speed1	speed2	speed1_value	speed1_unit	Speed_mph	height_value	height_unit	Height_ft	Inversions	Gforce

0 rows × 56 columns

6. What are the top 3 years with the most roller coasters introduced?

df["Year_Introduced"].value_counts(ascending=False).iloc[0:3]

1999    49
2000    47
1998    32
Name: Year_Introduced, dtype: int64

7. What is the average speed? Also display a plot to show it’s distribution.

speed = df["Speed"]
speed = speed.str.extract('(\d+\.?\d+)\s?km/h',expand=False).astype("float64").dropna()
speed_in_km_average = np.mean(np.array(speed), dtype=float)
print(f"The average speed in km/h is {np.round(speed_in_km_average,2)}")

The average speed in km/h is 78.13

fig, ax = plt.subplots(figsize=(10, 5))

ax = sns.histplot(speed)
ax.set_title("Distribution of Roller Coasters speed in km/h")
ax.set_xlabel("Roller Coasters")
ax.set_ylabel("Speed frequencies")
plt.show()

Figure 1: Distribution of Roller Coasters speed in km/h

8. Explore the feature relationships. Are there any positively or negatively correlated relationships?

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1087 entries, 0 to 1086
Data columns (total 56 columns):
 #   Column                         Non-Null Count  Dtype  
---  ------                         --------------  -----  
 0   Coaster_Name                   1087 non-null   object 
 1   Length                         953 non-null    object 
 2   Speed                          937 non-null    object 
 3   Location                       1087 non-null   object 
 4   Status                         874 non-null    object 
 5   Opening date                   837 non-null    object 
 6   Type                           1087 non-null   object 
 7   Manufacturer                   1028 non-null   object 
 8   Height restriction             831 non-null    object 
 9   Model                          744 non-null    object 
 10  Height                         965 non-null    object 
 11  Inversions                     932 non-null    float64
 12  Lift/launch system             795 non-null    object 
 13  Cost                           382 non-null    object 
 14  Trains                         718 non-null    object 
 15  Park section                   487 non-null    object 
 16  Duration                       765 non-null    object 
 17  Capacity                       575 non-null    object 
 18  G-force                        362 non-null    object 
 19  Designer                       578 non-null    object 
 20  Max vertical angle             357 non-null    object 
 21  Drop                           494 non-null    object 
 22  Soft opening date              96 non-null     object 
 23  Fast Lane available            69 non-null     object 
 24  Replaced                       173 non-null    object 
 25  Track layout                   335 non-null    object 
 26  Fastrack available             19 non-null     object 
 27  Soft opening date.1            96 non-null     object 
 28  Closing date                   236 non-null    object 
 29  Opened                         27 non-null     object 
 30  Replaced by                    88 non-null     object 
 31  Website                        87 non-null     object 
 32  Flash Pass Available           50 non-null     object 
 33  Must transfer from wheelchair  106 non-null    object 
 34  Theme                          44 non-null     object 
 35  Single rider line available    81 non-null     object 
 36  Restraint Style                22 non-null     object 
 37  Flash Pass available           46 non-null     object 
 38  Acceleration                   60 non-null     object 
 39  Restraints                     24 non-null     object 
 40  Name                           35 non-null     object 
 41  Year_Introduced                1087 non-null   int64  
 42  latitude                       812 non-null    float64
 43  longitude                      812 non-null    float64
 44  Type_Main                      1087 non-null   object 
 45  Opening_Date                   837 non-null    object 
 46  speed1                         937 non-null    object 
 47  speed2                         935 non-null    object 
 48  speed1_value                   937 non-null    float64
 49  speed1_unit                    937 non-null    object 
 50  Speed_mph                      937 non-null    float64
 51  height_value                   965 non-null    float64
 52  height_unit                    965 non-null    object 
 53  Height_ft                      171 non-null    float64
 54  Inversions                     1087 non-null   int64  
 55  Gforce                         362 non-null    float64
dtypes: float64(8), int64(2), object(46)
memory usage: 475.7+ KB

## Cleaning dataset for correlation matrix
df["Trains"] = df["Trains"].str.replace("single","1",case=False).str.extract('(\d+)\s*trains?')
df["Max vertical angle"] = df["Max vertical angle"].str.extract('(\d+\.?\d+)')
df["Capacity"] = df["Capacity"].str.replace(",","").str.extract('(\d\,?\d+)')
df["speed2"] = df["speed2"].str.extract('(\d+\.?\d+)')
for col in ["Length","Height", "Height restriction", "Drop"]:
    df[col] = df[col].str.replace(",","").str.extract('(?<=\()(\d+\.?\d+)')

## Select only numeric values
columns = ["Length", "Height restriction", "Height", "Inversions", "Trains",  "Capacity", "Gforce", "Max vertical angle",
          "Drop", "Year_Introduced", "speed2"]
corr_data = df.loc[:, columns]
corr_data = corr_data.astype(float).fillna(0)
corr_data = corr_data.loc[:,~corr_data.columns.duplicated()]

corr_data

	Length	Height restriction	Height	Inversions	Trains	Capacity	Gforce	Max vertical angle	Drop	Year_Introduced	speed2
0	180.0	0.0	15.0	0.0	0.0	1600.0	2.9	30.0	13.0	1884.0	9.7
1	0.0	0.0	0.0	1.0	0.0	0.0	12.0	0.0	0.0	1895.0	0.0
2	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	0.0	1896.0	0.0
3	0.0	0.0	0.0	1.0	0.0	0.0	0.0	0.0	0.0	1901.0	0.0
4	0.0	0.0	0.0	1.0	0.0	0.0	0.0	0.0	0.0	1901.0	0.0
...	...	...	...	...	...	...	...	...	...	...	...
1082	1050.0	140.0	34.0	3.0	3.0	1500.0	4.7	0.0	0.0	2022.0	85.0
1083	1014.0	0.0	54.0	2.0	2.0	0.0	0.0	95.0	0.0	2022.0	117.0
1084	966.0	122.0	23.8	0.0	7.0	1680.0	4.0	0.0	0.0	2022.0	95.4
1085	230.0	0.0	34.0	0.0	0.0	0.0	0.0	0.0	0.0	2022.0	55.0
1086	1000.0	0.0	40.0	3.0	0.0	0.0	0.0	87.0	39.0	2022.0	93.0

1087 rows × 11 columns

fig, ax = plt.subplots(figsize=(10, 5))
ax = sns.heatmap(corr_data.corr(),annot=True)
plt.show()

9. Create your own question and answer it. Find out which roller coasters are still opened and calculate number of years since opening and visulize that.

## Filter only opend roller coasters 
open_roller_coasters = df[df["Status"] == "Operating"]
currentYear = datetime.now().year
years_opened = currentYear - open_roller_coasters["Year_Introduced"]
years_opened

6       121
13      113
15      111
22      103
23      103
       ... 
1066      2
1068      2
1069      2
1070      2
1071      2
Name: Year_Introduced, Length: 668, dtype: int64

fig, ax = plt.subplots(figsize=(10, 5))
ax = sns.histplot(years_opened)
ax.set_xlabel("Years opened")
ax.set_ylabel("Frequency")
ax.set_title("Distribution of years opened for still operating roller coasters")
plt.show()