Skip to content

Commit

Permalink
first commit
Browse files Browse the repository at this point in the history
  • Loading branch information
JaseZiv committed Jan 1, 2021
1 parent 27c0edd commit 9d3ee18
Show file tree
Hide file tree
Showing 16 changed files with 343 additions and 21 deletions.
Binary file added .DS_Store
Binary file not shown.
2 changes: 2 additions & 0 deletions .Rbuildignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
^.*\.Rproj$
^\.Rproj\.user$
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
.Rproj.user
.Rhistory
.RData
.Ruserdata
31 changes: 31 additions & 0 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
Package: worldfootballR
Type: Package
Title: An R Package To Extract World Football (Soccer) data from fbref.com
Version: 0.0.0.9000
Authors@R: person("Jason Zivkovic", email = "jase.ziv83@gmail.com",
role = c("aut", "cre"))
URL: https://github.com/JaseZiv/worldfootballR
BugReports: https://github.com/JaseZiv/worldfootballR/issues
Description: This R package will enable users to extract a number of different game and player statistics and metrics from fbref.com.
License: GPL-3
Encoding: UTF-8
LazyData: true
Imports:
rlang,
dplyr,
purrr,
tidyr,
stringr,
lubridate,
magrittr,
scales,
xml2,
rvest,
curl
RoxygenNote: 7.1.0
Suggests: knitr,
rmarkdown,
ggplot2
VignetteBuilder: knitr
21 changes: 0 additions & 21 deletions LICENSE

This file was deleted.

6 changes: 6 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
# Generated by roxygen2: do not edit by hand

export(get_match_results)
export(get_match_urls)
importFrom(magrittr,"%>%")
importFrom(rlang,.data)
Binary file added R/.DS_Store
Binary file not shown.
74 changes: 74 additions & 0 deletions R/get_match_results.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
#' Get match results
#'
#' Returns the game results for a given league season
#'
#' @param country the three character country code
#' @param gender gender of competition, either "M" or "F"
#' @param season_end_year the year the season concludes, in quotes, ie "2021"
#'
#' @return returns a dataframe with the results of the competition, season and
#'   gender. Columns are Season, Round, Wk, Day, Date, Time, Home, HomeGoals,
#'   Away, AwayGoals, Attendance, Venue, Referee and Notes, plus Home_xG and
#'   Away_xG when the scraped table carries xG columns. Round is NA when the
#'   scraped table has no Round column.
#'
#' @importFrom magrittr %>%
#' @importFrom rlang .data
#'
#' @export
#'
#' @examples
#' \dontrun{
#' get_match_results(country = "ITA", gender = "M", season_end_year = "2020")
#' }
get_match_results <- function(country, gender, season_end_year) {
  print(paste0(
    "Scraping ", country, " first division for the ", season_end_year,
    " season (gender = ", gender, ")"
  ))

  main_url <- "https://fbref.com"

  season_url <- .get_league_season_url(country, gender, season_end_year)
  if (length(season_url) != 1) {
    stop(
      "Could not find a unique season page for country = ", country,
      ", gender = ", gender, ", season_end_year = ", season_end_year,
      call. = FALSE
    )
  }

  fixture_links <- xml2::read_html(season_url) %>%
    rvest::html_nodes(".hoversmooth") %>%
    rvest::html_nodes(".full") %>%
    rvest::html_nodes("a") %>%
    rvest::html_attr("href")
  fixture_links <- unique(fixture_links[grepl("Fixtures", fixture_links)])
  # paste0() turns a zero-length vector into "", which would silently produce
  # the bare site root as the fixtures URL, so fail loudly instead
  if (length(fixture_links) == 0) {
    stop("No fixtures link found on ", season_url, call. = FALSE)
  }
  fixtures_url <- paste0(main_url, fixture_links[1])

  season_summary <- xml2::read_html(fixtures_url) %>%
    rvest::html_table() %>%
    .[1] %>%
    data.frame() %>%
    dplyr::filter(.data$Date != "")

  # NOTE(review): the score separator is assumed to be an en dash ("2-1" style
  # with U+2013) - a plain hyphen is accepted as well; confirm against the site.
  # An empty `sep` never splits the score, leaving both goal columns NA.
  season_summary <- season_summary %>%
    tidyr::separate(
      "Score",
      into = c("HomeGoals", "AwayGoals"),
      sep = "[\u2013-]",
      extra = "drop",
      fill = "right"
    )

  # non-numeric pieces become NA; only the coercion warnings are silenced
  season_summary <- suppressWarnings(
    season_summary %>%
      dplyr::mutate(
        HomeGoals = as.numeric(.data$HomeGoals),
        AwayGoals = as.numeric(.data$AwayGoals),
        Attendance = as.numeric(gsub(",", "", .data$Attendance))
      )
  )

  # rep() keeps this valid for a zero-row table, where recycling a scalar fails
  n_matches <- nrow(season_summary)
  season_summary <- cbind(
    Season = rep(season_end_year, n_matches),
    season_summary
  )

  if (!"Round" %in% names(season_summary)) {
    season_summary$Round <- rep(NA, n_matches)
  }

  # columns are selected by name (strings) so no bare symbols leak into the
  # package namespace checks
  if (all(c("xG", "xG.1") %in% names(season_summary))) {
    season_summary <- season_summary %>%
      dplyr::select(
        "Season", "Round", "Wk", "Day", "Date", "Time",
        "Home", "HomeGoals", Home_xG = "xG",
        "Away", "AwayGoals", Away_xG = "xG.1",
        "Attendance", "Venue", "Referee", "Notes"
      )
  } else {
    season_summary <- season_summary %>%
      dplyr::select(
        "Season", "Round", "Wk", "Day", "Date", "Time",
        "Home", "HomeGoals",
        "Away", "AwayGoals",
        "Attendance", "Venue", "Referee", "Notes"
      )
  }

  season_summary
}
44 changes: 44 additions & 0 deletions R/get_match_urls.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
#' Get match URLs
#'
#' Returns the URL for each match played for a given league season
#'
#' @param country the three character country code
#' @param gender gender of competition, either "M" or "F"
#' @param season the year the season concludes, in quotes, ie "2021"
#'
#' @return returns a character vector of all fbref match URLs for a given
#'   competition, season and gender. The vector is empty when the fixtures
#'   page contains no match links.
#'
#' @importFrom magrittr %>%
#' @importFrom rlang .data
#'
#' @export
#'
#' @examples
#' \dontrun{
#' get_match_urls(country = "ENG", gender = "M", season = "2020")
#' }
get_match_urls <- function(country, gender, season) {
  main_url <- "https://fbref.com"

  country <- toupper(country)
  gender <- toupper(gender)

  season_url <- .get_league_season_url(country, gender, season)
  if (length(season_url) != 1) {
    stop(
      "Could not find a unique season page for country = ", country,
      ", gender = ", gender, ", season = ", season,
      call. = FALSE
    )
  }

  fixture_links <- xml2::read_html(season_url) %>%
    rvest::html_nodes(".hoversmooth") %>%
    rvest::html_nodes(".full") %>%
    rvest::html_nodes("a") %>%
    rvest::html_attr("href")
  fixture_links <- unique(fixture_links[grepl("Fixtures", fixture_links)])
  # paste0() turns a zero-length vector into "", which would silently produce
  # the bare site root as the fixtures URL, so fail loudly instead
  if (length(fixture_links) == 0) {
    stop("No fixtures link found on ", season_url, call. = FALSE)
  }
  fixtures_url <- paste0(main_url, fixture_links[1])

  match_links <- xml2::read_html(fixtures_url) %>%
    rvest::html_nodes("td.left~ .left+ .left a") %>%
    rvest::html_attr("href")
  # drop anchors without an href so they do not become "<main_url>NA"
  match_links <- unique(match_links[!is.na(match_links)])

  # same paste0() pitfall as above: no links must mean an empty result
  if (length(match_links) == 0) {
    return(character(0))
  }

  paste0(main_url, match_links)
}
2 changes: 2 additions & 0 deletions R/globals.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@

# "." is used as the magrittr pipe placeholder throughout the package;
# declaring it here keeps R CMD check from reporting it as an undefined global
utils::globalVariables(".")
73 changes: 73 additions & 0 deletions R/internals.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,73 @@
#' Get Tier 1 competitions
#'
#' Scrapes the fbref competitions index page and returns the table of Tier 1
#' club competitions together with the URL of each competition
#'
#' @return returns a dataframe with the tier 1 competitions from around the
#'   world, with an added \code{competition_urls} column
#'
#' @importFrom magrittr %>%
#' @importFrom rlang .data
#'
.get_tier1_competitions <- function() {
  fbref_root <- "https://fbref.com"

  # page listing every competition
  comps_page <- xml2::read_html("https://fbref.com/en/comps/")

  # only the Tier 1 club competitions section is used - another selector is
  # needed here if more competitions are required
  tier1_section <- rvest::html_nodes(
    comps_page,
    "#all_comps_1_fa_club_league_senior"
  )

  # the table describing the competitions
  league_table <- tier1_section %>%
    rvest::html_nodes(".sortable") %>%
    rvest::html_table() %>%
    data.frame()

  # relative link of each competition, taken from the table body header cells
  league_hrefs <- tier1_section %>%
    rvest::html_node("tbody") %>%
    rvest::html_nodes("th a") %>%
    rvest::html_attr("href")

  # attach the absolute competition URL as a column
  league_table <- cbind(
    league_table,
    competition_urls = paste0(fbref_root, league_hrefs)
  )

  # drop the two character flag code so only the 3 character code remains
  league_table$Country <- gsub(".*? ", "", league_table$Country)

  return(league_table)
}



#' Get URL of league season
#'
#' Returns a URL for the selected league seasons
#'
#' @param country the three character country code
#' @param gender gender of competition, either "M" or "F"
#' @param season the year the season concludes, in quotes, ie "2021"
#'
#' @return a URL for the selected league seasons. An error is raised when no
#'   single tier 1 competition matches \code{country} and \code{gender}, or
#'   when the competition page lists no season ending in \code{season}.
#'
#' @importFrom magrittr %>%
#' @importFrom rlang .data
#'
.get_league_season_url <- function(country, gender, season) {
  main_url <- "https://fbref.com"

  competitions <- .get_tier1_competitions()

  league_url <- competitions %>%
    dplyr::filter(
      toupper(.data$Country) == toupper(country),
      toupper(.data$Gender) == toupper(gender)
    ) %>%
    dplyr::pull("competition_urls")
  league_url <- unique(as.character(league_url))

  # read_html() needs exactly one URL; report a bad country/gender clearly
  if (length(league_url) != 1) {
    stop(
      "Expected exactly one tier 1 competition for country = ", country,
      ", gender = ", gender, " but found ", length(league_url),
      call. = FALSE
    )
  }

  season_links <- xml2::read_html(league_url) %>%
    rvest::html_nodes("th a")

  # link text such as "2019-2020" is reduced to its ending year, "2020"
  seasons <- gsub(".*-", "", rvest::html_text(season_links))
  season_hrefs <- rvest::html_attr(season_links, "href")

  # which() drops NA comparisons; anchors without an href are ignored
  matched <- which(seasons == as.character(season) & !is.na(season_hrefs))
  # paste0() turns a zero-length vector into "", which would fabricate the
  # bare site root as a season URL, so fail loudly instead
  if (length(matched) == 0) {
    stop(
      "No season ending in ", season, " found at ", league_url,
      call. = FALSE
    )
  }

  paste0(main_url, unique(season_hrefs[matched]))
}
21 changes: 21 additions & 0 deletions man/dot-get_league_season_url.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

14 changes: 14 additions & 0 deletions man/dot-get_tier1_competitions.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

26 changes: 26 additions & 0 deletions man/get_match_results.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

26 changes: 26 additions & 0 deletions man/get_match_urls.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

20 changes: 20 additions & 0 deletions worldfootballR.Rproj
Original file line number Diff line number Diff line change
@@ -0,0 +1,20 @@
Version: 1.0

RestoreWorkspace: Default
SaveWorkspace: Default
AlwaysSaveHistory: Default

EnableCodeIndexing: Yes
UseSpacesForTab: Yes
NumSpacesForTab: 2
Encoding: UTF-8

RnwWeave: Sweave
LaTeX: pdfLaTeX

AutoAppendNewline: Yes
StripTrailingWhitespace: Yes

BuildType: Package
PackageUseDevtools: Yes
PackageInstallArgs: --no-multiarch --with-keep.source

0 comments on commit 9d3ee18

Please sign in to comment.