Thanks - good basic exploratory data workout. Did last one in M, so doing this one in R:
Click for R Script
# BRIAN JULIUS SOLUTION TO eDNA DATA ANALYSIS WORKOUT 003
library(tidyverse)
BabyNames <- read_csv("https://raw.githubusercontent.com/guipsamora/pandas_exercises/master/06_Stats/US_Baby_Names/US_Baby_Names_right.csv")
#Q1 - See the first 10 entries.
head(BabyNames, 10)
#Q2 - Delete the columns ‘Unnamed: 0’ and ‘Id’.
BabyNames <- BabyNames %>% select(-c(...1, Id))
head( BabyNames, 10 )
#Q3 - Group the dataset by name, assign to a variable called names, and sort the dataset by highest to lowest count.
Names <- BabyNames %>%
group_by(Name) %>%
summarise(TotalCount = sum(Count)) %>%
arrange(desc(TotalCount))
head(Names, 10)
#Q4 - How many different names exist in the dataset?
Names %>% summarise(distinct_names = n_distinct(Name))
#Q5 - What is the name with most occurrences?
Names %>% head(1)
#Q6 - What is the standard deviation of count of names?
Names %>% summarise(std_dev_Count = sd(TotalCount))
#Q7 - Get a summary of the dataset with the mean, min, max, std and quartiles.
summary_stats <- Names %>%
summarise(
across(
TotalCount,
list(
mean = mean,
min = min,
max = max,
std = sd,
Q1 = ~quantile(., 0.25),
Q3 = ~quantile(., 0.75)
)
)
)
options(width = 200)
print(summary_stats)
Click for Output
> # BRIAN JULIUS SOLUTION TO eDNA DATA ANALYSIS WORKOUT 003
>
> library(tidyverse)
> BabyNames <- read_csv("https://raw.githubusercontent.com/guipsamora/pandas_exercises/master/06_Stats/US_Baby_Names/US_Baby_Names_right.csv")
New names:
• `` -> `...1`
Rows: 1016395 Columns: 7
── Column specification ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: ","
chr (3): Name, Gender, State
dbl (4): ...1, Id, Year, Count
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
>
> #Q1 - See the first 10 entries.
> head(BabyNames, 10)
# A tibble: 10 × 7
...1 Id Name Year Gender State Count
<dbl> <dbl> <chr> <dbl> <chr> <chr> <dbl>
1 11349 11350 Emma 2004 F AK 62
2 11350 11351 Madison 2004 F AK 48
3 11351 11352 Hannah 2004 F AK 46
4 11352 11353 Grace 2004 F AK 44
5 11353 11354 Emily 2004 F AK 41
6 11354 11355 Abigail 2004 F AK 37
7 11355 11356 Olivia 2004 F AK 33
8 11356 11357 Isabella 2004 F AK 30
9 11357 11358 Alyssa 2004 F AK 29
10 11358 11359 Sophia 2004 F AK 28
>
> #Q2 - Delete the columns ‘Unnamed: 0’ and ‘Id’.
> BabyNames <- BabyNames %>% select(-c(...1, Id))
> head( BabyNames, 10 )
# A tibble: 10 × 5
Name Year Gender State Count
<chr> <dbl> <chr> <chr> <dbl>
1 Emma 2004 F AK 62
2 Madison 2004 F AK 48
3 Hannah 2004 F AK 46
4 Grace 2004 F AK 44
5 Emily 2004 F AK 41
6 Abigail 2004 F AK 37
7 Olivia 2004 F AK 33
8 Isabella 2004 F AK 30
9 Alyssa 2004 F AK 29
10 Sophia 2004 F AK 28
>
> #Q3 - Group the dataset by name, assign to a variable called names, and sort the dataset by highest to lowest count.
> Names <- BabyNames %>%
+ group_by(Name) %>%
+ summarise(TotalCount = sum(Count)) %>%
+ arrange(desc(TotalCount))
> head(Names, 10)
# A tibble: 10 × 2
Name TotalCount
<chr> <dbl>
1 Jacob 242874
2 Emma 214852
3 Michael 214405
4 Ethan 209277
5 Isabella 204798
6 William 197894
7 Joshua 191551
8 Sophia 191446
9 Daniel 191440
10 Emily 190318
>
> #Q4 - How many different names exist in the dataset?
> Names %>% summarise(distinct_names = n_distinct(Name))
# A tibble: 1 × 1
distinct_names
<int>
1 17632
>
> #Q5 - What is the name with most occurrences?
> Names %>% head(1)
# A tibble: 1 × 2
Name TotalCount
<chr> <dbl>
1 Jacob 242874
>
> #Q6 - What is the standard deviation of count of names?
> Names %>% summarise(std_dev_Count = sd(TotalCount))
# A tibble: 1 × 1
std_dev_Count
<dbl>
1 11006.
>
> #Q7 - Get a summary of the dataset with the mean, min, max, std and quartiles.
> summary_stats <- Names %>%
+ summarise(
+ across(
+ TotalCount,
+ list(
+ mean = mean,
+ min = min,
+ max = max,
+ std = sd,
+ Q1 = ~quantile(., 0.25),
+ Q3 = ~quantile(., 0.75)
+ )
+ )
+ )
> options(width = 200)
> print(summary_stats)
# A tibble: 1 × 6
TotalCount_mean TotalCount_min TotalCount_max TotalCount_std TotalCount_Q1 TotalCount_Q3
<dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
1 2009. 5 242874 11006. 11 337
Group the dataset by name, assign to a variable called names, and sort the dataset by highest to lowest count.
# you don't want to sum the Year column, so you delete it
baby_names.drop(["Year"], axis=1, inplace = True)
# group the data
names = baby_names.groupby("Name").sum()
names.sort_values("Count", ascending = False)
How many different names exist in the dataset?
len(names)
What is the name with most occurrences?
names.Count.idxmax()
What is the standard deviation of count of names?
names.Count.std()
Get a summary of the dataset with the mean, min, max, std and quartiles.