-
Notifications
You must be signed in to change notification settings - Fork 0
/
Data Cleaning Final.R
131 lines (119 loc) · 8.67 KB
/
Data Cleaning Final.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
library(tidyr)
library(dplyr)
library(stringr)
#Loading all initial CSV files before cleaning them
# World Bank's country climate change characteristics from 1990-2011
wb_df <- read.csv("Initial CSV Files/World Bank Climate Change.csv")
#Berkeley Earth's country surface monthly average temperature from 1743 to 2013
berk_df <- read.csv("Initial CSV Files/GlobalLandTemperaturesByCountry.csv")
#Economic Dataset from WEO sorted by country from 1990 to 2020
weo_country_df <- read.csv("Initial CSV Files/WEO by country.csv")
#Summarizing the World Bank's Climate Change table to get CO2 Emissions Value
#converting all the year values to be rows and not columns
wb_df <- wb_df %>%
pivot_longer(
cols = starts_with("X"), #Selects all columns that start with X
names_to = "Year", # This is the new column that will contain the year values
values_to = "Value",
names_prefix = "X" # This removes the 'X' prefix from the year column names
) %>%
mutate(
Year = as.integer(gsub("X", "", Year)) # Convert the year to an integer and remove the 'X' prefix
) %>% select(Country.name,Series.name,Year,Value)
#filtering to only keep the CO2 Emissions part of the dataframe
wb_df <- wb_df %>% filter(Series.name == "CO2 emissions per capita (metric tons)" | Series.name == "Cereal yield (kg per hectare)" )
#converting all the CO2 Emissions from being rows to being columns
wb_df <- wb_df %>%
pivot_wider(
names_from = "Series.name",
values_from = "Value"
)
#changing all the values to be numeric and not characters and replacing NA values with 0
wb_df <- wb_df %>%
mutate(`Cereal yield (kg per hectare)` = gsub("\\.\\.", 0, `Cereal yield (kg per hectare)`)) %>%
mutate(`Cereal yield (kg per hectare)` = gsub("[^0-9.]", "", `Cereal yield (kg per hectare)`)) %>%
mutate(`Cereal yield (kg per hectare)` = as.numeric(`Cereal yield (kg per hectare)`))
wb_df <- wb_df %>%
mutate(`CO2 emissions per capita (metric tons)` = gsub("\\.\\.", 0, `CO2 emissions per capita (metric tons)`)) %>%
mutate(`CO2 emissions per capita (metric tons)` = gsub("[^0-9.]", "", `CO2 emissions per capita (metric tons)`)) %>%
mutate(`CO2 emissions per capita (metric tons)` = as.numeric(`CO2 emissions per capita (metric tons)`))
#Summarizing the berk dataset to get the Average Temperature and Average Uncertainty of each year for each country from 1990 to 2010.
# filering to keep only the years 1990 to 2010 as that is kept consistent with the WEO dataset
berk_df <- berk_df %>% mutate(dt = as.integer(str_sub(dt,1,4)))
berk_df <- berk_df %>% filter((str_starts(dt, "19") | str_starts(dt, "20")) & !str_starts(dt, "2011") & !str_starts(dt, "2012") & !str_starts(dt, "2013"))
# creating a new column "Group" that will indicate which year group and country group a the averaged volumn will belong to. (Since we know there is 12 months a year)
berk_df <- berk_df %>% mutate(Groups = ceiling(row_number()/12))
#calculating the average temperature based on the groups
average_temp <- berk_df %>% group_by(Groups) %>% summarize(Average_temp = mean(AverageTemperature,na.rm = TRUE),Average_temp_unct = mean(AverageTemperatureUncertainty,na.rm = TRUE))
#joining the two tables together
berk_df <- berk_df %>% left_join(average_temp,by=c("Groups"="Groups"))
#choosing only the first row of every group so get 1 value each year for each country
berk_df <- berk_df %>% group_by(Groups) %>% slice(1) %>% select(Groups,dt,Country,Average_temp,Average_temp_unct)
#Summarizing the WEO dataset for GDP, Population, and total Investment
#change the years from being columns to rows and selecting only important columns that we will need
weo_country_df <- weo_country_df %>%
pivot_longer(
cols = starts_with("X"), #Selects all columns that start with X
names_to = "Year", # This is the new column that will contain the year values
values_to = "Value",
names_prefix = "X" # This removes the 'X' prefix from the year column names
) %>%
mutate(
Year = as.integer(gsub("X", "", Year)) # Convert the year to an integer and remove the 'X' prefix
) %>% select(Country,Subject.Descriptor,Units,Scale,Year,Value)
#keep only the characteristics that we want to analyze by itself so we can change them to columns later
GDP <- weo_country_df %>% filter(Subject.Descriptor == "Gross domestic product, current prices")
Population <- weo_country_df %>% filter(Subject.Descriptor == "Population")
Investment <- weo_country_df %>% filter(Subject.Descriptor == "Total investment")
#changing each characteristics into columns and not rows
GDP <- GDP %>%
pivot_wider(
names_from = "Subject.Descriptor",
values_from = "Value"
) %>% rename("gdp"="Gross domestic product, current prices")
Population <- Population %>%
pivot_wider(
names_from = "Subject.Descriptor",
values_from = "Value"
)
Investment <- Investment %>%
pivot_wider(
names_from = "Subject.Descriptor",
values_from = "Value"
)
#joining the tables together
weo_country_df <- inner_join(GDP,Population,by=c("Country" = "Country","Year"="Year"))
weo_country_df <- inner_join(weo_country_df,Investment,by=c("Country" = "Country","Year"="Year"))
#Changing the Characteristics to contain units and scale by adding the scale and units column to the characterisitcs
weo_country_df <- weo_country_df %>% mutate(Units.x = paste(Units.x," (Billions)"),Units.y = paste(Units.y," (Millions)"))
weo_country_df <- weo_country_df %>% select(Country,Year,gdp,Units.x,Population,Units.y,`Total investment`,Units)
weo_country_df <- weo_country_df %>% rename("Gdp_units" = "Units.x", "Population_units" = "Units.y","
Total_invest_units"="Units","Gdp" = "gdp")
#Creating the new column GDP per Capita
weo_country_df <- weo_country_df %>% mutate(Gdp = gsub("[^0-9\\.]", "", Gdp), Gdp = as.numeric(Gdp), Population = gsub("[^0-9]", "", Population), Population = as.numeric(Population)) %>% mutate(Gdp_per_cap = if_else(is.na(Gdp) | is.na(Population), NA_real_, (Gdp * 1000) / Population))
weo_country_df <- weo_country_df %>% mutate(Gdp = if_else(is.na(Gdp), 0, Gdp),Population = if_else(is.na(Population), 0, Population),Gdp_per_cap = (Gdp * 1000) / Population)
weo_country_df <- weo_country_df %>% filter(!str_starts(Year,"198"))
#joining all of the tables together to be one final dataset
final_df <- left_join(berk_df,wb_df,by=c("Country"="Country.name","dt"="Year"))
final_df <- inner_join(weo_country_df,final_df,by=c("Country"="Country","Year"="dt"))
#Classifying countries and years based on average temperature
final_df <- final_df %>% mutate(climate_zone = case_when(Average_temp > 18 ~ "Tropical", Average_temp <= 18 & Average_temp > 10 ~ "Temperate", Average_temp<= 10 ~ "Cold", TRUE ~ "Not Classified"))
#Classifying countries based on their income level
final_df <- final_df %>% mutate(economic_status = case_when(Gdp < 10 ~ "Low-income",Gdp >= 10 & Gdp < 50 ~ "Lower-middle-income",Gdp >= 50 & Gdp < 200 ~ "Upper-middle-income",Gdp >= 200 ~ "High-income",TRUE ~ NA_character_))
# Classifying Development level based on GDP per capita and total investment percent
final_df <- final_df %>% mutate(development_status = case_when(Gdp_per_cap > 20 & `Total investment` > 20 ~ "Developed", Gdp_per_cap >= 5 & Gdp_per_cap <= 20 & `Total investment` >= 15 & `Total investment` <= 20 ~ "Developing",TRUE ~ "Developing" ))
#Classifying countries based on their CO2 emissions rate
final_df <- final_df %>% mutate(co2_category = case_when(`CO2 emissions per capita (metric tons)` < 2 ~ "Low emissions",`CO2 emissions per capita (metric tons)` >= 2 & `CO2 emissions per capita (metric tons)` < 10 ~ "Medium emissions",`CO2 emissions per capita (metric tons)` >= 10 & `CO2 emissions per capita (metric tons)` < 20 ~ "High emissions",`CO2 emissions per capita (metric tons)` >= 20 ~ "Very high emissions",TRUE ~ "Not Classified"))
#Classifying countries based on their population
final_df <- final_df %>% mutate(population_category = case_when(Population < 5 ~ "Small population",Population >= 5 & Population < 50 ~ "Medium population",Population >= 50 ~ "Large population",TRUE ~ "Not Classified"))
#renaming the columns in the dataframe
final_df <- final_df %>% rename("CO2_emissions"=`CO2 emissions per capita (metric tons)`,"Total_investment" = `Total investment`,"Cereal_yield" = `Cereal yield (kg per hectare)`)
#making it so that all the 0 and NaN values are all changed to NA properly, so it is easy to filter them later on
final_df$Gdp[final_df$Gdp == 0] <- NA
final_df$Population[final_df$Population == 0] <- NA
final_df$Total_investment[final_df$Total_investment == "n/a"] <- NA
final_df$Gdp_per_cap[final_df$Gdp_per_cap == "NaN"] <- NA
final_df$Cereal_yield[final_df$Cereal_yield == 0] <- NA
final_df$CO2_emissions[final_df$CO2_emissions == 0] <- NA
#saving the final dataframe as a csv file
write.csv(final_df,"final_df.csv")