-
Notifications
You must be signed in to change notification settings - Fork 0
/
Musical data visualization.R
155 lines (129 loc) · 5.26 KB
/
Musical data visualization.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
# Load libraries
library(readr)
library(dplyr)
library(readr)
library(ggplot2)
library(viridis)
library(wordcloud2)
# Load the dataset from GitHub ----
# The CSV is fetched over the network on every run, so this step needs
# internet access.
urlfile <- "https://raw.githubusercontent.com/dateascientist/wrangling-musical-data/master/datasets/bb_chords.csv"
# Text columns are read as factors. `TRUE` is spelled out because `T` is an
# ordinary variable that can be reassigned.
mydata <- read.csv(url(urlfile), stringsAsFactors = TRUE)

# Fail fast if the download did not yield the columns this script relies on.
stopifnot(all(c("chord", "year", "title", "artist") %in% names(mydata)))

# Understand the data ----
head(mydata)        # first rows
sum(is.na(mydata))  # total number of missing cells
summary(mydata)     # per-column summary
str(mydata)         # column types and structure
# Identify the most common chords and their frequency ----
# One row per distinct chord with its occurrence count `n`, most frequent
# first.
chord_count <- mydata %>%
  count(chord, sort = TRUE)

# Display the top 20 chords. head() returns only the rows that exist, whereas
# indexing with [1:20, ] pads a shorter data frame with all-NA rows.
head(chord_count, 20)
# Visualize most common chords ----
# Bar plot of the 20 most frequent chords.
# The fraction is computed over ALL chords before the top 20 are selected, so
# it matches the "Fraction of total chords" axis label. Dividing after slicing
# would give each chord's share of the top 20 only.
chord_count_fraction <- chord_count %>%
  mutate(fraction = n / sum(n)) %>%
  slice(1:20) %>%
  mutate(chord = reorder(chord, fraction)) # order the bars by frequency

ggplot(chord_count_fraction, aes(x = chord, y = fraction, fill = chord)) +
  geom_col() +
  theme(legend.position = "none") +
  ylab("Fraction of total chords") +
  xlab("Chords") +
  coord_flip() +
  scale_fill_viridis(discrete = TRUE, option = "H") # viridis option "H" palette
# Change of the top 5 most common chords over the years ----
# Yearly occurrence counts for the five most frequent chords overall.
# count(chord, year) tallies each chord/year pair directly and returns an
# ungrouped result; group_by() followed by count() would leave the result
# grouped by chord and year.
selected_chords_over_time <- mydata %>%
  filter(chord %in% head(chord_count$chord, 5)) %>%
  count(chord, year) %>%
  arrange(chord, desc(n), desc(year))

ggplot(selected_chords_over_time, aes(x = year, y = n, fill = chord)) +
  geom_col(position = "fill") +                   # stacked bars, each year scaled to 100%
  scale_y_continuous(labels = scales::percent) +  # format the y-axis as percentages
  scale_fill_viridis(discrete = TRUE, option = "H")
# Chord "bigrams" ----
# Pair every chord with the chord on the following row.
# NOTE(review): this assumes rows are stored in playing order within each
# song - confirm against the dataset.
#
# A pair is kept only when both rows belong to the same song. A song is
# identified by artist AND title, consistent with how songs are counted per
# artist elsewhere in this script. Comparing titles alone would join the last
# chord of one song to the first chord of an adjacent same-titled song by a
# different artist. The final row has no successor (NA) and is dropped by
# filter().
mydata_2 <- mydata %>%
  mutate(next_chord = lead(chord),
         next_title = lead(title)) %>%
  filter(title == next_title, artist == lead(artist)) %>%
  mutate(bigram = paste(chord, next_chord, sep = " "))

# One row per distinct bigram with its occurrence count `n`, most frequent
# first.
chord_bigram_count <- mydata_2 %>%
  count(bigram, sort = TRUE)

# Display the top 20 bigrams. head() does not pad with NA rows when fewer
# than 20 bigrams exist.
head(chord_bigram_count, 20)
# Visualizing the most common chord progressions ----
# Bar plot of the 20 most frequent bigrams.
# The fraction is computed over ALL bigrams before the top 20 are selected, so
# it matches the "Fraction of total bigrams" axis label. Dividing after
# slicing would give each bigram's share of the top 20 only.
bigram_count_fraction <- chord_bigram_count %>%
  mutate(fraction = n / sum(n)) %>%
  slice(1:20) %>%
  mutate(bigram = reorder(bigram, fraction)) # order the bars by frequency

ggplot(bigram_count_fraction, aes(x = bigram, y = fraction, fill = bigram)) +
  geom_col() +
  theme(legend.position = "none") +
  ylab("Fraction of total bigrams") +
  xlab("Bigrams") +
  coord_flip() +
  scale_fill_viridis(discrete = TRUE, option = "H")
# Change of the top 5 most common chord bigrams over the years ----
# Yearly occurrence counts for the five most frequent bigrams overall.
# count(bigram, year) tallies each bigram/year pair directly and returns an
# ungrouped result; group_by() followed by count() would leave the result
# grouped by bigram and year.
selected_bigrams_over_time <- mydata_2 %>%
  filter(bigram %in% head(chord_bigram_count$bigram, 5)) %>%
  count(bigram, year) %>%
  arrange(bigram, desc(n), desc(year))

ggplot(selected_bigrams_over_time, aes(x = year, y = n, fill = bigram)) +
  geom_col(position = "fill") +                   # stacked bars, each year scaled to 100%
  scale_y_continuous(labels = scales::percent) +  # format the y-axis as percentages
  scale_fill_viridis(discrete = TRUE, option = "H")
# Find most common artists ----
# Collapse the corpus to one row per (artist, title) song, tally the songs of
# each artist, and keep the 30 artists with the most songs.
top_30_artists <- mydata %>%
  distinct(artist, title) %>%
  count(artist, sort = TRUE) %>%
  head(n = 30)

# Show the 30 artists together with their song counts
top_30_artists

# Word cloud of those artists, coloured with five viridis colours and laid
# out in a star shape
set.seed(999)
wordcloud2(
  data = top_30_artists,
  size = 0.3,
  color = viridis(5),
  shape = "star"
)
# Tag the corpus ----
# Hand-written lookup table assigning an instrument to a handful of artists.
tags <- tibble(
  artist = c(
    "Abba", "Billy Joel", "Elton John", "Stevie Wonder",
    "The Rolling Stones", "The Beatles", "Eric Clapton"
  ),
  instrument = c(
    "piano", "piano", "piano", "piano",
    "guitar", "guitar", "guitar"
  )
)

# Create `mydata_tagged`: the corpus plus the `instrument` column from `tags`.
# The join key is named explicitly so the join does not depend on whichever
# column names the two tables happen to share. inner_join() keeps only rows
# whose artist appears in `tags`; rows for untagged artists are dropped.
mydata_tagged <- mydata %>%
  inner_join(tags, by = "artist")

# Display the new dataframe
mydata_tagged
# Compare chords in piano-driven and guitar-driven songs ----
# The 20 most frequent chords across the whole corpus
top_20_chords <- chord_count[seq_len(20), ]

# Occurrences of each of those chords in the tagged songs, split by
# instrument
tagged_chord_data <- mydata_tagged %>%
  filter(chord %in% top_20_chords$chord) %>%
  count(chord, instrument, sort = TRUE)
tagged_chord_data

# Horizontal bars of the counts, one panel per instrument
ggplot(tagged_chord_data, aes(x = chord, y = n, fill = chord)) +
  geom_col() +
  coord_flip() +
  labs(x = "Chords", y = "Number of Chords") +
  scale_fill_viridis(discrete = TRUE, option = "H") +
  theme(legend.position = "none") +
  facet_grid(. ~ instrument)
# Compare chord bigrams in piano-driven and guitar-driven songs ----
# Attach the instrument tag to every bigram row. The join key is named
# explicitly so the join does not depend on whichever column names the two
# tables happen to share. Rows for artists missing from `tags` are dropped.
mydata_2_tagged <- mydata_2 %>%
  inner_join(tags, by = "artist")

# Occurrences of the 20 most frequent bigrams in the tagged songs, split by
# instrument
tagged_bigram_data <- mydata_2_tagged %>%
  filter(bigram %in% head(chord_bigram_count$bigram, 20)) %>%
  count(bigram, instrument, sort = TRUE)
tagged_bigram_data

# Horizontal bars of the counts, one panel per instrument
ggplot(tagged_bigram_data, aes(x = bigram, y = n, fill = bigram)) +
  geom_col() +
  coord_flip() +
  xlab("Bigrams") +
  ylab("Number of Bigrams") +
  scale_fill_viridis(discrete = TRUE, option = "H") +
  theme(legend.position = "none") +
  facet_grid(~instrument)