Modelling hip hop songs dataset using PCA

An example of PCA based modelling approach

This dataset was published on Tidy Tuesday a few days back. It contains details of hip hop songs and the ratings given to them by a few independent critics. Some other details about the critics, such as gender, are also given. For each song, details such as year, title, and artist are provided.

Let's load the data

# Load the Tidy Tuesday (2020-04-14) hip hop song rankings. An empty path
# gives read_csv() nothing to read, so point it at the published CSV.
rankings <- read_csv(
  "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2020/2020-04-14/rankings.csv"
)
Explore the data

# Points per song against release year, coloured by gender. The plot must end
# here: a trailing `+` would make the next pipeline an operand of the plot.
rankings %>%
  ggplot(aes(year, points, color = gender)) +
  geom_jitter(alpha = 0.7)

# Total points per gender, largest first.
rankings %>%
  count(gender, wt = points, sort = TRUE)
Set up a Spotify developer account to access the API.

# Request a Spotify API access token through spotifyr.
# NOTE(review): presumably relies on client credentials configured outside
# this file (e.g. environment variables) — confirm before running.
access_token <- spotifyr::get_spotify_access_token()

Fetching hiphop songs playlist with features

# Audio features for the playlist identified by the user name and playlist id.
playlist_features <- spotifyr::get_playlist_audio_features(
  "tmock1923",
  "7esD007S7kzeSwVtcH9GFe"
)

# Build a lower-cased "title artist" search string for every song and strip a
# trailing "ft. <guests>" credit from it.
rankings <- rankings %>%
  mutate(
    search_term = str_to_lower(paste(title, artist)),
    # \\b pins "ft" to a whole word; a bare "ft.*$" also matches inside words
    # such as "left" or "gift" and would cut the search term short.
    search_term = str_remove(search_term, "\\bft\\b.*$")
  )

# Search Spotify for a track and return the id of its most popular match.
#
# query: free-text search string (for example "title artist").
# Returns the `id` of the search result with the highest `popularity`.
spotify_search <- function(query) {
  spotifyr::search_spotify(query, type = "track") %>%
    filter(popularity == max(popularity)) %>%
    # Results tied on popularity would otherwise return several ids.
    slice(1) %>%
    pull(id)
}

# Quick check on a single title.
spotify_search("Dear Mama")
# Resolve each song to a Spotify track id. possibly() turns a failed lookup
# into NA instead of aborting the run; unnest() flattens the list column that
# map() produces, and keep_empty = TRUE keeps songs whose search returned
# nothing as NA rows.
ranking_ids <- rankings %>%
  mutate(id = map(search_term, possibly(spotify_search, NA_character_))) %>%
  unnest(id, keep_empty = TRUE)

# Sum of the `n` column over the rows that remain once rows with NAs are
# dropped.
ranking_ids %>%
  na.omit() %>%
  count(wt = n)
# Fetch audio features for every resolved track id, in batches.
ranking_features <- ranking_ids %>%
  # Missing ids cannot be looked up, and duplicates would fan out a later join.
  filter(!is.na(id)) %>%
  distinct(id) %>%
  # Batch the ids so each request carries a bounded number of them.
  mutate(id_group = row_number() %/% 80) %>%
  nest(data = c(id)) %>%
  mutate(
    audio_features = map(data, ~ spotifyr::get_track_audio_features(.x$id))
  ) %>%
  # Unnest only the features: they already carry `id`, so unnesting `data` as
  # well would produce two `id` columns.
  select(id_group, audio_features) %>%
  unnest(audio_features)
# Attach the audio features to each ranked song and keep the modelling columns.
ranking_df <- ranking_ids %>%
  left_join(ranking_features, by = "id") %>%
  select(title, artist, points, year, danceability:tempo) %>%
  # Songs without a Spotify match have no features; drop them so the
  # normalisation/PCA steps downstream receive complete rows.
  na.omit()
# Pairwise correlations between the numeric columns, reordered so related
# variables sit together, with the redundant upper triangle blanked out.
ranking_df %>%
  select(year:tempo) %>%
  correlate() %>%
  rearrange() %>%
  shave()
Let's use tidymodels

# Preprocessing recipe: title/artist are identifiers rather than predictors,
# the outcome is log-transformed, and the predictors are normalised and then
# rotated onto principal components.
ranking_rec <- recipe(points ~ ., data = ranking_df) %>%
  update_role(title, artist, new_role = "id") %>%
  step_log(points) %>%
  step_normalize(all_predictors()) %>%
  # Third step: the tidy(), steps[[3]] and PC1/PC2 references below rely on it.
  step_pca(all_predictors())

# Estimate the recipe on the training data.
ranking_prep <- prep(ranking_rec)

# Loadings of the third step (the PCA), one row per term/component pair.
tidied_pca <- tidy(ranking_prep, 3)

# Loadings of every term on every component, one panel per component.
tidied_pca %>%
  # Keep components in their order of appearance rather than alphabetically.
  mutate(component = fct_inorder(component)) %>%
  ggplot(aes(value, terms, fill = terms)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~component) +
  labs(y = NULL)

# The six largest contributors (by magnitude) to each of the first six
# components.
tidied_pca %>%
  filter(component %in% c("PC1", "PC2", "PC3", "PC4", "PC5", "PC6")) %>%
  group_by(component) %>%
  top_n(6, abs(value)) %>%
  ungroup() %>%
  # Order the terms within each panel by the size of their contribution.
  mutate(terms = reorder_within(terms, abs(value), component)) %>%
  # Bar length is the magnitude, matching the axis label and the ordering;
  # the sign is carried by the fill colour instead.
  ggplot(aes(abs(value), terms, fill = value > 0)) +
  geom_col() +
  facet_wrap(~component, scales = "free_y") +
  scale_y_reordered() +
  labs(
    x = "Absolute Value of Contribution",
    y = NULL,
    fill = "Positive?"
  )

# Songs in the plane of the first two principal components, labelled by title.
juice(ranking_prep) %>%
  ggplot(aes(PC1, PC2)) +
  # An alpha of 0 would make the points fully transparent.
  geom_point(alpha = 0.2) +
  geom_text(aes(label = title), check_overlap = TRUE)

# Standard deviations of the principal components, read from the result
# stored on the third recipe step.
sdev <- ranking_prep$steps[[3]]$res$sdev

# Share of the total variance carried by each component.
percent_variation <- sdev^2 / sum(sdev^2)

# Bar chart of variance explained, components ordered from largest to smallest.
tibble(
  component = unique(tidied_pca$component),
  percent_var = percent_variation
) %>%
  mutate(component = fct_reorder(component, -percent_var)) %>%
  ggplot(aes(component, percent_var)) +
  geom_col()

Let's now fit a linear regression on the principal components

# Regress the (log-transformed) points on the principal components; the
# identifier columns are dropped so they are not used as predictors.
pca_lm <- juice(ranking_prep) %>%
  select(-artist, -title) %>%
  lm(points ~ ., data = .)

# Print coefficients and fit statistics for the model.
summary(pca_lm)

## Call:
## lm(formula = points ~ ., data = .)
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.52232 -0.58476  0.02525  0.39202  2.86961 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.93004    0.04837  39.905   <2e-16 ***
## PC1          0.07391    0.03282   2.252    0.025 *  
## PC2         -0.04769    0.03708  -1.286    0.199    
## PC3         -0.05605    0.04121  -1.360    0.175    
## PC4          0.01458    0.04612   0.316    0.752    
## PC5          0.01326    0.04678   0.283    0.777    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Residual standard error: 0.8419 on 297 degrees of freedom
## Multiple R-squared:  0.02864,    Adjusted R-squared:  0.01229 
## F-statistic: 1.751 on 5 and 297 DF,  p-value: 0.1228

That’s a small exercise to see how we can leverage the functionality of PCA in our model.

