-
Notifications
You must be signed in to change notification settings - Fork 66
/
Copy pathget_match_results.R
101 lines (76 loc) · 3.67 KB
/
get_match_results.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
#' Get match results
#'
#' Returns the game results for the given league season(s). The list of
#' available seasons is read from the worldfootballR_data repository, and the
#' fixtures page of every matching season is scraped from fbref.com.
#'
#' @param country one or more three character country codes, e.g. "ENG"
#' @param gender gender(s) of competition, either "M" or "F"
#' @param season_end_year the year(s) the season concludes
#'
#' @return returns a dataframe with one row per match for the requested
#'   competitions, seasons and genders, with the columns Competition_Name,
#'   Gender, Country, Season_End_Year, Season_Name, Round, Wk, Day, Date, Time,
#'   Home, HomeGoals, Away, AwayGoals, Attendance, Venue, Referee and Notes.
#'   Home_xG and Away_xG are included when the scraped table has xG columns.
#'   Rows are ordered by country, competition, gender, season, week, date and
#'   time. An error is raised when no season matches the supplied arguments or
#'   when a season page has no link to a fixtures page.
#'
#' @importFrom magrittr %>%
#' @importFrom rlang .data
#' @importFrom utils read.csv
#'
#' @export
#'
#' @examples
#' \dontrun{
#' get_match_results(
#'   country = c("ENG", "ITA", "GER"),
#'   gender = "M",
#'   season_end_year = c(2018:2021)
#' )
#' }
get_match_results <- function(country, gender, season_end_year) {
  cat("Scraping match results\n")

  # The arguments share their names with columns of `seasons`; aliasing them
  # keeps the filter below comparing column values against the user's input
  # rather than each column against itself.
  country_abbr <- country
  gender_M_F <- gender
  season_end_year_num <- season_end_year

  main_url <- "https://fbref.com"

  seasons <- read.csv(
    "https://raw.githubusercontent.com/JaseZiv/worldfootballR_data/master/raw-data/league_seasons/all_tier1_season_URLs.csv",
    stringsAsFactors = FALSE
  )

  seasons <- seasons %>%
    dplyr::filter(
      country %in% country_abbr,
      gender %in% gender_M_F,
      season_end_year %in% season_end_year_num
    )

  # Without this guard an empty selection only fails later, in the join, with
  # an error about a missing join column.
  if (nrow(seasons) == 0) {
    stop(
      "No league seasons found for the supplied country, gender and ",
      "season_end_year.",
      call. = FALSE
    )
  }

  seasons_urls <- seasons[["seasons_urls"]]

  # Scrape and tidy the fixtures table of a single season page.
  get_each_season_results <- function(season_url) {
    fixtures_links <- xml2::read_html(season_url) %>%
      rvest::html_nodes(".hoversmooth") %>%
      rvest::html_nodes(".full") %>%
      rvest::html_nodes("a") %>%
      rvest::html_attr("href")
    fixtures_links <- fixtures_links[grepl("Fixtures", fixtures_links)]

    # paste0() on a zero-length vector would yield just `main_url`, so a
    # missing link must be caught before the URL is built.
    if (length(fixtures_links) == 0) {
      stop(
        "No fixtures link found on season page: ", season_url,
        call. = FALSE
      )
    }
    # read_html() needs a single URL; use the first match if there are several.
    fixtures_url <- paste0(main_url, fixtures_links[1])

    fixtures_page <- xml2::read_html(fixtures_url)

    season_name <- fixtures_page %>%
      rvest::html_nodes("h2 span") %>%
      rvest::html_text() %>%
      .[1]

    # data.frame() makes duplicated column names unique, which is where the
    # `xG.1` column selected further down comes from.
    season_summary <- fixtures_page %>%
      rvest::html_table() %>%
      .[1] %>%
      data.frame() %>%
      dplyr::filter(.data$Date != "")

    # Warnings are suppressed here as in the original implementation --
    # presumably for rows whose score cannot be split or coerced (they become
    # NA); TODO confirm and narrow the suppression.
    season_summary <- suppressWarnings(
      season_summary %>%
        tidyr::separate(
          col = "Score",
          into = c("HomeGoals", "AwayGoals"),
          sep = "–"
        ) %>%
        dplyr::mutate(
          HomeGoals = as.numeric(.data$HomeGoals),
          AwayGoals = as.numeric(.data$AwayGoals),
          Attendance = as.numeric(gsub(",", "", .data$Attendance))
        )
    )

    # Assign with explicit lengths so a season with zero rows still works
    # (cbind() of length-one values with a zero-row data frame errors).
    n_matches <- nrow(season_summary)
    season_summary$season_url <- rep(season_url, n_matches)
    season_summary$season_name <- rep(season_name, n_matches)

    # Some tables have no Round column; add an empty one so every season
    # returns the same set of columns.
    if (!any(stringr::str_detect(names(season_summary), "Round"))) {
      season_summary$Round <- rep(NA, n_matches)
    }

    # Columns are selected by name (strings) rather than `.data$`, which is
    # deprecated inside tidyselect contexts.
    if (any(stringr::str_detect(names(season_summary), "xG"))) {
      season_summary <- season_summary %>%
        dplyr::select(
          "season_url", Season_Name = "season_name", "Round", "Wk", "Day",
          "Date", "Time", "Home", "HomeGoals", Home_xG = "xG", "Away",
          "AwayGoals", Away_xG = "xG.1", "Attendance", "Venue", "Referee",
          "Notes"
        )
    } else {
      season_summary <- season_summary %>%
        dplyr::select(
          "season_url", Season_Name = "season_name", "Round", "Wk", "Day",
          "Date", "Time", "Home", "HomeGoals", "Away", "AwayGoals",
          "Attendance", "Venue", "Referee", "Notes"
        )
    }

    season_summary
  }

  all_results <- seasons_urls %>%
    purrr::map_df(get_each_season_results)

  # Attach the competition metadata to every match via the season URL.
  all_results <- seasons %>%
    dplyr::select(
      Competition_Name = "competition_name",
      Gender = "gender",
      Country = "country",
      Season_End_Year = "season_end_year",
      "seasons_urls"
    ) %>%
    dplyr::left_join(all_results, by = c("seasons_urls" = "season_url")) %>%
    dplyr::select(-dplyr::all_of("seasons_urls")) %>%
    dplyr::mutate(Date = lubridate::ymd(.data$Date)) %>%
    dplyr::arrange(
      .data$Country, .data$Competition_Name, .data$Gender,
      .data$Season_End_Year, .data$Wk, .data$Date, .data$Time
    )

  cat("Match results finished scraping\n")

  all_results
}