diff --git a/.DS_Store b/.DS_Store index aa5554ce..4625e334 100644 Binary files a/.DS_Store and b/.DS_Store differ diff --git a/5_Reproducible_Research/.DS_Store b/5_Reproducible_Research/.DS_Store index 324272e7..b6860cde 100644 Binary files a/5_Reproducible_Research/.DS_Store and b/5_Reproducible_Research/.DS_Store differ diff --git a/5_Reproducible_Research/project2/ReproducibleResearchProject2.html b/5_Reproducible_Research/project2/ReproducibleResearchProject2.html index e9505d08..73e81459 100644 --- a/5_Reproducible_Research/project2/ReproducibleResearchProject2.html +++ b/5_Reproducible_Research/project2/ReproducibleResearchProject2.html @@ -1,303 +1,363 @@ - - - - -
- - - - - - - - - -github repo for rest of specialization: Data Science Coursera
-The goal of the assignment is to explore the NOAA Storm Database and explore the effects of severe weather events on both population and economy.The database covers the time period between 1950 and November 2011.
-The analysis aims to investigate which different types of severe weather events are most harmful on the populations health in respect of general injuries and fatalities. Further the economic consequences will be analyzed by exploring the financial damage done to both general property and agriculture (i.e. crops)
-information on the data: Documentation
-Download the raw data file and extract the data into a dataframe.
-library("data.table")
-library(ggplot2)
-
-fileUrl <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
-download.file(fileUrl, destfile = paste0("C:/Users/mgalarnyk/Desktop", '/repdata%2Fdata%2FStormData.csv.bz2'))
-stormDF <- read.csv("C:/Users/mgalarnyk/Desktop/repdata%2Fdata%2FStormData.csv.bz2")
-
-stormDT <- as.data.table(stormDF)
-colnames(stormDT)
-## [1] "STATE__" "BGN_DATE" "BGN_TIME" "TIME_ZONE" "COUNTY"
-## [6] "COUNTYNAME" "STATE" "EVTYPE" "BGN_RANGE" "BGN_AZI"
-## [11] "BGN_LOCATI" "END_DATE" "END_TIME" "COUNTY_END" "COUNTYENDN"
-## [16] "END_RANGE" "END_AZI" "END_LOCATI" "LENGTH" "WIDTH"
-## [21] "F" "MAG" "FATALITIES" "INJURIES" "PROPDMG"
-## [26] "PROPDMGEXP" "CROPDMG" "CROPDMGEXP" "WFO" "STATEOFFIC"
-## [31] "ZONENAMES" "LATITUDE" "LONGITUDE" "LATITUDE_E" "LONGITUDE_"
-## [36] "REMARKS" "REFNUM"
-Subset the dataset on the parameters of interest. Basically, we remove the columns we don’t need
-# Finding columns to remove
-cols2Remove <- colnames(stormDT[, !c("EVTYPE"
- , "FATALITIES"
- , "INJURIES"
- , "PROPDMG"
- , "PROPDMGEXP"
- , "CROPDMG"
- , "CROPDMGEXP")])
-
-# Removing columns
-stormDT[, c(cols2Remove) := NULL]
-
-# Only get data where fatalities or injuries occurred.
-stormDT <- stormDT[(EVTYPE != "?" &
- (INJURIES > 0 | FATALITIES > 0 | PROPDMG > 0 | CROPDMG > 0)), c("EVTYPE"
- , "FATALITIES"
- , "INJURIES"
- , "PROPDMG"
- , "PROPDMGEXP"
- , "CROPDMG"
- , "CROPDMGEXP") ]
-# Change all damage exponents to uppercase.
-cols <- c("PROPDMGEXP", "CROPDMGEXP")
-stormDT[, (cols) := c(lapply(.SD, toupper)), .SDcols = cols]
-
-# Map property damage alphanumeric exponents to numeric values.
-propDmgKey <- c("\"\"" = 10^0,
- "-" = 10^0,
- "+" = 10^0,
- "0" = 10^0,
- "1" = 10^1,
- "2" = 10^2,
- "3" = 10^3,
- "4" = 10^4,
- "5" = 10^5,
- "6" = 10^6,
- "7" = 10^7,
- "8" = 10^8,
- "9" = 10^9,
- "H" = 10^2,
- "K" = 10^3,
- "M" = 10^6,
- "B" = 10^9)
-
-# Map crop damage alphanumeric exponents to numeric values
-cropDmgKey <- c("\"\"" = 10^0,
- "?" = 10^0,
- "0" = 10^0,
- "K" = 10^3,
- "M" = 10^6,
- "B" = 10^9)
-
-stormDT[,PROPDMGEXP := propDmgKey[as.character(stormDT[,PROPDMGEXP])]]
-stormDT[is.na(PROPDMGEXP), PROPDMGEXP := 10^0 ]
-
-stormDT[, CROPDMGEXP := cropDmgKey[as.character(stormDT[,CROPDMGEXP])] ]
-stormDT[is.na(CROPDMGEXP), CROPDMGEXP := 10^0 ]
-stormDT <- stormDT[, .(EVTYPE, FATALITIES, INJURIES, PROPDMG, PROPDMGEXP, propCost = PROPDMG * PROPDMGEXP, CROPDMG, CROPDMGEXP, cropCost = CROPDMG * CROPDMGEXP)]
-# Total injuries
-totalInjuriesDT <- stormDT[, .(FATALITIES = sum(FATALITIES), INJURIES = sum(INJURIES), totals = sum(FATALITIES) + sum(INJURIES)), by = .(EVTYPE)]
-
-totalInjuriesDT <- totalInjuriesDT[order(-FATALITIES), ]
-
-totalInjuriesDT <- totalInjuriesDT[1:10, ]
-
-head(totalInjuriesDT, 5)
-## EVTYPE FATALITIES INJURIES totals
-## 1: TORNADO 5633 91346 96979
-## 2: EXCESSIVE HEAT 1903 6525 8428
-## 3: FLASH FLOOD 978 1777 2755
-## 4: HEAT 937 2100 3037
-## 5: LIGHTNING 816 5230 6046
-melting data.table so that each earlier column name (fatalities, totals) and their associated value goes into the
-bad_stuff <- melt(totalInjuriesDT, id.vars="EVTYPE", variable.name = "bad_thing")
-# Create chart
-healthChart <- ggplot(bad_stuff, aes(x=reorder(EVTYPE, -value), y=value))
-
-# Plot data as bar chart
-healthChart = healthChart + geom_bar(stat="identity", aes(fill=bad_thing), position="dodge")
-
-# Format y-axis scale and set y-axis label
-healthChart = healthChart + scale_y_sqrt("Frequency Count")
-
-# Set x-axis label
-healthChart = healthChart + xlab("Event Type")
-
-# Rotate x-axis tick labels
-healthChart = healthChart + theme(axis.text.x = element_text(angle=45, hjust=1))
-
-# Set chart title
-healthChart = healthChart + ggtitle("Top 10 US Killers")
-
-healthChart
-
-Github repo for the Course: Reproducible Research Github repo for Rest of Specialization: Data Science Coursera
+The goal of the assignment is to explore the NOAA Storm Database and examine the effects of severe weather events on both population and economy. The database covers the time period between 1950 and November 2011.
+The following analysis investigates which types of severe weather events are most harmful on:
+Information on the Data: Documentation
+Download the raw data file and extract the data into a dataframe. Then convert it to a data.table.
+library("data.table")
+library("ggplot2")
+
+# Source archive for the storm data (a .csv.bz2 file).
+fileUrl <- "https://d396qusza40orc.cloudfront.net/repdata%2Fdata%2FStormData.csv.bz2"
+# NOTE(review): the destination is hard-coded to one user's Desktop and the file
+# is downloaded on every run; adjust the path before running this elsewhere.
+download.file(fileUrl, destfile = paste0("/Users/mgalarny/Desktop", '/repdata%2Fdata%2FStormData.csv.bz2'))
+# Read from the same path that download.file() wrote to above.
+stormDF <- read.csv("/Users/mgalarny/Desktop/repdata%2Fdata%2FStormData.csv.bz2")
+
+# Convert the data.frame to a data.table, then list its column names.
+stormDT <- as.data.table(stormDF)
+colnames(stormDT)
+## [1] "STATE__" "BGN_DATE" "BGN_TIME" "TIME_ZONE" "COUNTY"
+## [6] "COUNTYNAME" "STATE" "EVTYPE" "BGN_RANGE" "BGN_AZI"
+## [11] "BGN_LOCATI" "END_DATE" "END_TIME" "COUNTY_END" "COUNTYENDN"
+## [16] "END_RANGE" "END_AZI" "END_LOCATI" "LENGTH" "WIDTH"
+## [21] "F" "MAG" "FATALITIES" "INJURIES" "PROPDMG"
+## [26] "PROPDMGEXP" "CROPDMG" "CROPDMGEXP" "WFO" "STATEOFFIC"
+## [31] "ZONENAMES" "LATITUDE" "LONGITUDE" "LATITUDE_E" "LONGITUDE_"
+## [36] "REMARKS" "REFNUM"
+Subset the dataset on the parameters of interest. Basically, we remove the columns we don’t need for clarity.
+# Finding columns to remove: every column except the seven listed below
+cols2Remove <- colnames(stormDT[, !c("EVTYPE"
+ , "FATALITIES"
+ , "INJURIES"
+ , "PROPDMG"
+ , "PROPDMGEXP"
+ , "CROPDMG"
+ , "CROPDMGEXP")])
+
+# Removing columns (:= NULL drops them from stormDT by reference)
+stormDT[, c(cols2Remove) := NULL]
+
+# Keep only rows whose event type is not "?" and where at least one of
+# injuries, fatalities, property damage, or crop damage is greater than zero.
+stormDT <- stormDT[(EVTYPE != "?" &
+ (INJURIES > 0 | FATALITIES > 0 | PROPDMG > 0 | CROPDMG > 0)), c("EVTYPE"
+ , "FATALITIES"
+ , "INJURIES"
+ , "PROPDMG"
+ , "PROPDMGEXP"
+ , "CROPDMG"
+ , "CROPDMGEXP") ]
+Making the PROPDMGEXP and CROPDMGEXP columns cleaner so they can be used to calculate property and crop cost.
+# Change all damage exponents to uppercase.
+cols <- c("PROPDMGEXP", "CROPDMGEXP")
+stormDT[, (cols) := c(lapply(.SD, toupper)), .SDcols = cols]
+
+# Map property damage alphanumeric exponents to numeric multipliers.
+# NOTE(review): the first name is the two-character string of quote marks, not
+# the empty string; any code with no matching name here comes back as NA from
+# the lookup below and is then reset to a multiplier of 1.
+propDmgKey <- c("\"\"" = 10^0,
+ "-" = 10^0,
+ "+" = 10^0,
+ "0" = 10^0,
+ "1" = 10^1,
+ "2" = 10^2,
+ "3" = 10^3,
+ "4" = 10^4,
+ "5" = 10^5,
+ "6" = 10^6,
+ "7" = 10^7,
+ "8" = 10^8,
+ "9" = 10^9,
+ "H" = 10^2,
+ "K" = 10^3,
+ "M" = 10^6,
+ "B" = 10^9)
+
+# Map crop damage alphanumeric exponents to numeric multipliers
+# (same lookup-then-fallback scheme as the property key above).
+cropDmgKey <- c("\"\"" = 10^0,
+ "?" = 10^0,
+ "0" = 10^0,
+ "K" = 10^3,
+ "M" = 10^6,
+ "B" = 10^9)
+
+# Replace each property exponent code with its multiplier; unmatched codes -> 1.
+stormDT[, PROPDMGEXP := propDmgKey[as.character(stormDT[,PROPDMGEXP])]]
+stormDT[is.na(PROPDMGEXP), PROPDMGEXP := 10^0 ]
+
+# Replace each crop exponent code with its multiplier; unmatched codes -> 1.
+stormDT[, CROPDMGEXP := cropDmgKey[as.character(stormDT[,CROPDMGEXP])] ]
+stormDT[is.na(CROPDMGEXP), CROPDMGEXP := 10^0 ]
+# Add propCost and cropCost: the damage figure multiplied by its multiplier.
+stormDT <- stormDT[, .(EVTYPE, FATALITIES, INJURIES, PROPDMG, PROPDMGEXP, propCost = PROPDMG * PROPDMGEXP, CROPDMG, CROPDMGEXP, cropCost = CROPDMG * CROPDMGEXP)]
+# Sum property and crop cost per event type; Total_Cost is the two sums added.
+totalCostDT <- stormDT[, .(propCost = sum(propCost), cropCost = sum(cropCost), Total_Cost = sum(propCost) + sum(cropCost)), by = .(EVTYPE)]
+
+# Sort by total cost, largest first.
+totalCostDT <- totalCostDT[order(-Total_Cost), ]
+
+# Keep the first ten rows, i.e. the ten costliest event types.
+totalCostDT <- totalCostDT[1:10, ]
+
+head(totalCostDT, 5)
+## EVTYPE propCost cropCost Total_Cost
+## 1: FLOOD 144657709807 5661968450 150319678257
+## 2: HURRICANE/TYPHOON 69305840000 2607872800 71913712800
+## 3: TORNADO 56947380676 414953270 57362333946
+## 4: STORM SURGE 43323536000 5000 43323541000
+## 5: HAIL 15735267513 3025954473 18761221986
+# Sum fatalities and injuries per event type; totals is the two sums added.
+totalInjuriesDT <- stormDT[, .(FATALITIES = sum(FATALITIES), INJURIES = sum(INJURIES), totals = sum(FATALITIES) + sum(INJURIES)), by = .(EVTYPE)]
+
+# Sort by fatalities, largest first (the ranking uses FATALITIES, not totals).
+totalInjuriesDT <- totalInjuriesDT[order(-FATALITIES), ]
+
+# Keep the first ten rows, i.e. the ten event types with the most fatalities.
+totalInjuriesDT <- totalInjuriesDT[1:10, ]
+
+head(totalInjuriesDT, 5)
+## EVTYPE FATALITIES INJURIES totals
+## 1: TORNADO 5633 91346 96979
+## 2: EXCESSIVE HEAT 1903 6525 8428
+## 3: FLASH FLOOD 978 1777 2755
+## 4: HEAT 937 2100 3037
+## 5: LIGHTNING 816 5230 6046
+Melting data.table so that it is easier to put in bar graph format
+# Reshape to long format: one row per event type and measure. Every column
+# other than EVTYPE (FATALITIES, INJURIES, totals) is melted; the measure name
+# goes into bad_thing and its number into value.
+bad_stuff <- melt(totalInjuriesDT, id.vars="EVTYPE", variable.name = "bad_thing")
+head(bad_stuff, 5)
+## EVTYPE bad_thing value
+## 1: TORNADO FATALITIES 5633
+## 2: EXCESSIVE HEAT FATALITIES 1903
+## 3: FLASH FLOOD FATALITIES 978
+## 4: HEAT FATALITIES 937
+## 5: LIGHTNING FATALITIES 816
+# Create chart; reorder() on -value controls the order of event types on the x-axis
+healthChart <- ggplot(bad_stuff, aes(x=reorder(EVTYPE, -value), y=value))
+
+# Plot data as bar chart, with side-by-side bars coloured by bad_thing
+healthChart = healthChart + geom_bar(stat="identity", aes(fill=bad_thing), position="dodge")
+
+# Set y-axis label
+healthChart = healthChart + ylab("Frequency Count")
+
+# Set x-axis label
+healthChart = healthChart + xlab("Event Type")
+
+# Rotate x-axis tick labels
+healthChart = healthChart + theme(axis.text.x = element_text(angle=45, hjust=1))
+
+# Set chart title and center it
+healthChart = healthChart + ggtitle("Top 10 US Killers") + theme(plot.title = element_text(hjust = 0.5))
+
+# Print the finished chart
+healthChart
+
+Melting data.table so that it is easier to put in bar graph format
+# Reshape to long format: one row per event type and cost measure. Every column
+# other than EVTYPE (propCost, cropCost, Total_Cost) is melted; the measure name
+# goes into Damage_Type and its number into value.
+econ_consequences <- melt(totalCostDT, id.vars="EVTYPE", variable.name = "Damage_Type")
+head(econ_consequences, 5)
+## EVTYPE Damage_Type value
+## 1: FLOOD propCost 144657709807
+## 2: HURRICANE/TYPHOON propCost 69305840000
+## 3: TORNADO propCost 56947380676
+## 4: STORM SURGE propCost 43323536000
+## 5: HAIL propCost 15735267513
+# Create chart; reorder() on -value controls the order of event types on the x-axis
+econChart <- ggplot(econ_consequences, aes(x=reorder(EVTYPE, -value), y=value))
+
+# Plot data as bar chart, with side-by-side bars coloured by Damage_Type
+econChart = econChart + geom_bar(stat="identity", aes(fill=Damage_Type), position="dodge")
+
+# Set y-axis label
+econChart = econChart + ylab("Cost (dollars)")
+
+# Set x-axis label
+econChart = econChart + xlab("Event Type")
+
+# Rotate x-axis tick labels
+econChart = econChart + theme(axis.text.x = element_text(angle=45, hjust=1))
+
+# Set chart title and center it
+econChart = econChart + ggtitle("Top 10 US Storm Events causing Economic Consequences") + theme(plot.title = element_text(hjust = 0.5))
+
+# Print the finished chart
+econChart
+
+