pastebin
: Tools to work with the pastebin API
Pastebin is a website where you can store any text online for easy sharing. The website is mainly used by programmers to store pieces of source code or configuration information, but anyone is more than welcome to paste any type of text. The idea behind the site is to make it more convenient for people to share large amounts of text online.
WIP!! The package API will very likely be in rapid change mode for a bit
The following functions are implemented:
get_paste
: Get raw paste data

get_paste_metadata
: Get paste metadata

get_trending_pastes
: Get trending pastes

get_recent_pastes
: Get recent pastes

new_paste
: Create a new paste

pastebin_api_key
: Get or set PASTEBIN_API_KEY value

toString.paste
: Extract just the paste text from a paste object

as.character.paste
: Extract just the paste text from a paste object
TODO:

- Paste as user
- Finish API coverage, including "Pro" paste features
- Testing
devtools::install_github("hrbrmstr/pastebin")
options(width=120)
library(pastebin)
library(tidyverse)
# current version
packageVersion("pastebin")
## [1] '0.1.0'
get_trending_pastes() %>%
arrange(desc(hits))
## # A tibble: 18 × 10
## key date title size expire_date private format_short
## <chr> <dttm> <chr> <dbl> <dttm> <lgl> <chr>
## 1 sneAjEtZ 2017-02-03 03:55:22 DCW List - As of Feb 3 2017, 3:30PM 49001 <NA> FALSE text
## 2 y9P19guS 2017-02-02 12:09:22 a backdoor with backdoors 2235 <NA> FALSE text
## 3 pKDfBzxL 2017-02-02 18:18:09 83177 <NA> FALSE text
## 4 pXRWThRZ 2017-02-01 22:09:06 2583 <NA> FALSE text
## 5 2FDzA38q 2017-02-02 05:04:28 177 <NA> FALSE text
## 6 ck9y4Fsr 2017-02-02 19:50:56 [DOC/JS threat] Uploaded by @JohnLaTwC 55510 <NA> FALSE javascript
## 7 hfe0RmkZ 2017-02-03 19:20:58 DROPBOX DATABASE LEAKED 9973 <NA> FALSE text
## 8 YmEf4Lg3 2017-02-04 13:24:39 69780 <NA> FALSE text
## 9 gZbumgyx 2017-02-04 04:11:01 NESMania Game Replays 4165 <NA> FALSE text
## 10 QJL60dNC 2017-02-02 03:36:33 02/02/2017 - KTOS 1159 <NA> FALSE text
## 11 BMyYKAWQ 2017-02-03 14:39:09 INFERNO RULES DUMP 5542 <NA> FALSE text
## 12 g28xPFqf 2017-02-02 21:08:59 OPDeathEathers JTSEC full recon #99 187357 <NA> FALSE text
## 13 fSv8esiY 2017-02-03 20:45:33 900 <NA> FALSE text
## 14 hWDX5cxi 2017-02-02 11:19:23 UPDATED 03/02/2017 CSGO500 24006 <NA> FALSE text
## 15 9nF9RHSm 2017-02-03 19:13:25 ytsurp555 is a virus. 74 <NA> FALSE text
## 16 J9NTXjYh 2017-02-05 05:49:22 #FreeOurSisters 2466 <NA> FALSE text
## 17 jPvffpFW 2017-02-02 10:28:17 leaked 4774 <NA> FALSE text
## 18 kcJT0nT4 2017-02-02 19:26:39 new list sssss 32396 <NA> FALSE text
## # ... with 3 more variables: format_long <chr>, url <chr>, hits <dbl>
r_pastes <- get_recent_pastes(lang="rsplus")
glimpse(r_pastes)
## Observations: 50
## Variables: 9
## $ scrape_url <chr> "http://pastebin.com/api_scrape_item.php?i=DFH545DQ", "http://pastebin.com/api_scrape_item.php?i...
## $ full_url <chr> "http://pastebin.com/DFH545DQ", "http://pastebin.com/LmNWvVzW", "http://pastebin.com/yuE9WC4T", ...
## $ date <dttm> 2017-02-05 15:01:29, 2017-02-05 13:49:11, 2017-02-05 13:10:12, 2017-02-05 12:20:30, 2017-02-05 ...
## $ key <chr> "DFH545DQ", "LmNWvVzW", "yuE9WC4T", "EruPnWy9", "dw7zsagE", "9zTdrZsK", "UhDLx67h", "HL9KUbPT", ...
## $ size <dbl> 447, 1613, 402, 2804, 1543, 447, 472, 447, 472, 472, 447, 422, 417, 402, 427, 1633, 1204, 350, 1...
## $ expire <dttm> NA, NA, NA, NA, 2017-02-12 03:34:16, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA...
## $ title <chr> "Black Sails S04E02 WEBRip X264-DEFLATE", "", "Ransom S01E05 HDTV x264-FLEET", "ues", "RRR", "De...
## $ syntax <chr> "rsplus", "rsplus", "rsplus", "rsplus", "rsplus", "rsplus", "rsplus", "rsplus", "rsplus", "rsplu...
## $ user <chr> "AllRls_net", "", "AllRls_net", "", "", "AllRls_net", "AllRls_net", "AllRls_net", "AllRls_net", ...
Can't always trust the lang setting. Some non-R stuff in there:
walk(r_pastes$key[1:10], ~print(toString(get_paste(.))))
## [1] "\n\" Black Sails S04E02 WEBRip X264-DEFLATE \"\n# Filename: Black.Sails.S04E02.WEBRip.X264-DEFLATE.mkv\n# Size: 399.99 MB\n\n\" Uploaded \"\nhttp:/uploaded.net/file/2hnjglq2/Black.Sails.S04E02.WEBRip.X264-DEFLATE.mkv\n\n\" Rapidgator \"\nhttp:/rapidgator.net/file/44cb773e21675407fd563946e4421299/Black.Sails.S04E02.WEBRip.X264-DEFLATE.mkv.html\n\n\" Uploadrocket \"\nhttp:/uploadrocket.net/5c26avxw1r8t/Black.Sails.S04E02.WEBRip.X264-DEFLATE.mkv.html"
## [1] "t1 <- proc.time()\r\n\r\ndata <- read.csv(\"dataset.csv\")\r\ndata <- data[data$class == 13, ] # load data, only 13 class\r\ndata <- data[data$attendance != 0, ]\r\ndata <- data[!duplicated(data[6:32]),]\r\n#data <- unique(data) # unique\r\n\r\n#difficulty of 2,3 instr\r\ni <- data[data$instr == 2, ]\r\nmeans <- colMeans(i) # means of all columns\r\ndeviations <- colSds(as.matrix(i)) # standard deviations\r\n\r\ni <- data[data$instr == 3, ]\r\nmeans <- colMeans(i) # means of all columns\r\ndeviations <- colSds(as.matrix(i)) # standard deviations\r\n\r\nsums <- colSums(data[1:33]) # sums of all columns\r\nmeans <- colMeans(data) # means of all columns\r\nmedians <- colMedians(as.matrix(data[1:33]))# medians\r\ndeviations <- colSds(as.matrix(data[1:33])) # standard deviations\r\ndispersions <- deviations*deviations # dispersions\r\nquantiles <- colQuantiles(as.matrix(data[1:33])) #quantilies\r\nmins <- colMins(as.matrix(data[1:33])) # min of each column\r\nmaxs <- colMaxs(as.matrix(data[1:33])) # max of each column\r\ncor <- cor(data[6:33]) # variance covariance matrix\r\ndet(cov(data[6:33]))\r\n\r\nproc.time() - t1\r\n\r\n#graphic\r\nbarplot(data.frame(table(i$difficulty))[,2],names.arg = \"Difficulty\",col = \"blue\")\r\nbarplot(data.frame(table(i$difficulty))[,2],names.arg = \"Difficulty\",col = \"blue\")\r\nbarplot(sums,col = \"blue\")\r\nbarplot(medians,col = \"blue\")\r\nbarplot(maxs,col=\"blue\")\r\nbarplot(quantiles[,1],col = \"blue\")\r\nbarplot(quantiles[,2],col = \"blue\")\r\nbarplot(quantiles[,3],col = \"blue\")\r\nbarplot(as.vector(table(as.vector(as.matrix(data[,6:33])))),col=\"blue\")\r\nboxplot(data[6:32],col = \"blue\")\r\nplot(sort(rowSums(data[6:33])),col=\"blue\",type = \"s\")"
## [1] "\n\" Ransom S01E05 HDTV x264-FLEET \"\n# Filename: Ransom.S01E05.HDTV.x264-FLEET.mkv\n# Size: 221.49 MB\n\n\" Uploaded \"\nhttp:/uploaded.net/file/024rgxm7/Ransom.S01E05.HDTV.x264-FLEET.mkv\n\n\" Rapidgator \"\nhttp:/rapidgator.net/file/8ea37e4113156c376f0dc69dfa5b9abc/Ransom.S01E05.HDTV.x264-FLEET.mkv.html\n\n\" Uploadrocket \"\nhttp:/uploadrocket.net/w3fd0sai0u09/Ransom.S01E05.HDTV.x264-FLEET.mkv.html"
## [1] "library(rvest)\r\nURL <- \"https://archive.ics.uci.edu/ml/machine-learning-databases/eeg-mld/eeg_full/\"\r\n\r\npg <- read_html(URL)\r\nresult1 <- pg %>% html_nodes(\"a\") %>% html_attr(\"href\")\r\nresult2 <- result1[6:length(result1)]\r\n\r\nG <- 71 # asci for G\r\nset.seed(G)\r\nnAplus <- 10\r\nnA <- 5\r\nnBC <- 2\r\nmyGroupn <- nAplus\r\nusersToRead <- sample(1:length(result2),myGroupn,replace = FALSE)\r\nresult2[usersToRead]\r\n\r\n#418, 339, 417, 400, 415, 450, 351, 392, 416, 428\r\n\r\nuntar(\"co3a0000450.tar.gz\")\r\nuntar(\"co2c0000392.tar.gz\")\r\nuntar(\"co2c0000351.tar.gz\")\r\nuntar(\"co2c0000339.tar.gz\")\r\nuntar(\"co2a0000440.tar.gz\")\r\nuntar(\"co2a0000428.tar.gz\")\r\nuntar(\"co2a0000418.tar.gz\")\r\nuntar(\"co2a0000417.tar.gz\")\r\nuntar(\"co2a0000416.tar.gz\")\r\nuntar(\"co2a0000415.tar.gz\")\r\n\r\n#library(data.table)\r\n#list <- c(\"co3a0000450/\",\"co2c0000392/\",\"co2c0000351/\",\"co2c0000339/\",\"co2a0000440/\",\"co2a0000428/\",\"co2a0000418/\",\"co2a0000417/\",\"co2a0000416/\",\"co2a0000415/\")\r\n\r\nrep1 <- list.files(path = \"C:/Users/raulg/Desktop/statistics/co3a0000450/\",pattern = \".gz\")\r\nrep2 <- list.files(path = \"C:/Users/raulg/Desktop/statistics/co2c0000392/\",pattern = \".gz\")\r\nrep3 <- list.files(path = \"C:/Users/raulg/Desktop/statistics/co2c0000351/\",pattern = \".gz\")\r\nrep4 <- list.files(path = \"C:/Users/raulg/Desktop/statistics/co2c0000339/\",pattern = \".gz\")\r\nrep5 <- list.files(path = \"C:/Users/raulg/Desktop/statistics/co2a0000440/\",pattern = \".gz\")\r\nrep6 <- list.files(path = \"C:/Users/raulg/Desktop/statistics/co2a0000428/\",pattern = \".gz\")\r\nrep7 <- list.files(path = \"C:/Users/raulg/Desktop/statistics/co2a0000418/\",pattern = \".gz\")\r\nrep8 <- list.files(path = \"C:/Users/raulg/Desktop/statistics/co2a0000417/\",pattern = \".gz\")\r\nrep9 <- list.files(path = \"C:/Users/raulg/Desktop/statistics/co2a0000416/\",pattern = \".gz\")\r\nrep10 <- list.files(path = \"C:/Users/raulg/Desktop/statistics/co2a0000415/\",pattern = \".gz\")\r\n\r\ncount<-0\r\nfor (i in rep1){\r\n \r\n a <- gzfile(paste(\"C:/Users/raulg/Desktop/statistics/co3a0000450/\",i,sep=\"\"))\r\n res <- readLines(a)\r\n for (i in res){\r\n id<-paste(paste(strsplit(res[1],\"\")[[1]][11],strsplit(res[1],\"\")[[1]][12]),strsplit(res[1],\"\")[[1]][13])\r\n id<-gsub(\" \", \"\", id, fixed = TRUE)\r\n if(strsplit(res[1],\"\")[[1]][3]==\"a\"){\r\n alch <- \"alch\"}\r\n if(strsplit(res[1],\"\")[[1]][3]!=\"a\"){\r\n alch <- \"nonAl\"}\r\n \r\n lal <- data.frame(toString(alch),\r\n id,\r\n paste(paste(strsplit(res[4],\" \")[[1]][2],strsplit(res[4],\" \")[[1]][3]),\r\n strsplit(res[8],\" \")[[1]][1]),\r\n strsplit(res[4],\" \")[[1]][6],\r\n strsplit(res[2],\" \")[[1]][4],\r\n count,\r\n strsplit(res[3],\" \")[[1]][2])\r\n df <- rbind.data.frame(df,lal)\r\n count <- count +1\r\n print(i)\r\n if(count == 256){\r\n count<-0\r\n }\r\n}}"
## [1] "t1 <- proc.time()\r\nsetwd(\"C:\\\\Users\\\\Nikita\\\\Downloads\\\\r_lab\")\r\ntabl <- read.csv(file=\"turkiye.csv\", header = TRUE)\r\ntabl <- tabl[tabl$class == 10, ]\r\ntabl <- tabl[tabl$attendance != 0, ]\r\ntabl <- tabl[!duplicated(tabl[6:33]),]\r\n\r\nfor (i in 1:33)\r\n{\r\n sum[i] <- sum(tabl[ ,i])\r\n min[i] <- min(tabl[ ,i])\r\n max[i] <- max(tabl[ ,i])\r\n medians[i] <- median(tabl[ ,i])\r\n sd[i] <- sd(tabl[ ,i])\r\n var[i] <- var(tabl[ ,i])\r\n quantile_25[i] <- quantile(tabl[ ,i], probs = c(0.25))\r\n quantile_50[i] <- quantile(tabl[ ,i], probs = c(0.50))\r\n quantile_75[i] <- quantile(tabl[ ,i], probs = c(0.75))\r\n}\r\ncor <- cor(tabl[ ,3:33])\r\ncov <- cov(tabl[ ,3:33])\r\nproc.time() - t1\r\n\r\nsum_npPeret <- sum(tabl[ ,3])\r\nmin_npPeret <- min(tabl[ ,3])\r\nmax_npPeret <- max(tabl[ ,3])\r\nmean_npPeret <- mean(tabl[ ,3])\r\nvar_npPeret <- var(tabl[ ,3])\r\nsd_npPeret <- sd(tabl[ ,3])\r\nquantile_npPeret <- quantile(tabl[ ,3],probs = c(0.25,0.5,0.75))\r\nmedian_npPeret <- median(tabl[ ,3])\r\n\r\nsum_difficulty <- sum(tabl[ ,5])\r\nmin_difficulty <- min(tabl[ ,5])\r\nmax_difficulty <- max(tabl[ ,5])\r\nmean_difficulty <- mean(tabl[ ,5])\r\nvar_difficulty <- var(tabl[ ,5])\r\nsd_difficulty <- sd(tabl[ ,5])\r\nquantile_difficulty <- quantile(tabl[ ,5],probs = c(0.25,0.5,0.75))\r\nmedian_difficulty <- median(tabl[ ,5])\r\n\r\nsum_Q15 <- sum(tabl[ ,20])\r\nmin_Q15 <- min(tabl[ ,20])\r\nmax_Q15 <- max(tabl[ ,20])\r\nmean_Q15 <- mean(tabl[ ,20])\r\nvar_Q15 <- var(tabl[ ,20])\r\nsd_Q15 <- sd(tabl[ ,20])\r\nquantile_Q15 <- quantile(tabl[ ,20],probs = c(0.25,0.5,0.75))\r\nmedian_Q15 <- median(tabl[ ,20])"
## [1] "\n\" Detroit Steel S01E02 HDTV x264-KILLERS \"\n# Filename: Detroit.Steel.S01E02.HDTV.x264-KILLERS.mkv\n# Size: 286.64 MB\n\n\" Uploaded \"\nhttp:/uploaded.net/file/qaxov4n4/Detroit.Steel.S01E02.HDTV.x264-KILLERS.mkv\n\n\" Rapidgator \"\nhttp:/rapidgator.net/file/88ff8edf5def48d8549de1ab08d974b2/Detroit.Steel.S01E02.HDTV.x264-KILLERS.mkv.html\n\n\" Uploadrocket \"\nhttp:/uploadrocket.net/9pergvrzj8u1/Detroit.Steel.S01E02.HDTV.x264-KILLERS.mkv.html"
## [1] "\n\" Detroit Steel S01E02 720p HDTV x264-KILLERS \"\n# Filename: Detroit.Steel.S01E02.720p.HDTV.x264-KILLERS.mkv\n# Size: 921.80 MB\n\n\" Uploaded \"\nhttp:/uploaded.net/file/1fys32s3/Detroit.Steel.S01E02.720p.HDTV.x264-KILLERS.mkv\n\n\" Rapidgator \"\nhttp:/rapidgator.net/file/da4846e7b19b3b8fe7b3a1a6269788a0/Detroit.Steel.S01E02.720p.HDTV.x264-KILLERS.mkv.html\n\n\" Uploadrocket \"\nhttp:/uploadrocket.net/dah9gz1owi3n/Detroit.Steel.S01E02.720p.HDTV.x264-KILLERS.mkv.html"
## [1] "\n\" Counting Cars S07E04 HDTV x264-KILLERS \"\n# Filename: Counting.Cars.S07E04.HDTV.x264-KILLERS.mkv\n# Size: 190.96 MB\n\n\" Uploaded \"\nhttp:/uploaded.net/file/lyzl1ydr/Counting.Cars.S07E04.HDTV.x264-KILLERS.mkv\n\n\" Rapidgator \"\nhttp:/rapidgator.net/file/86e7dd0341c627ccac35d51087f677c4/Counting.Cars.S07E04.HDTV.x264-KILLERS.mkv.html\n\n\" Uploadrocket \"\nhttp:/uploadrocket.net/aacdv5hcggt7/Counting.Cars.S07E04.HDTV.x264-KILLERS.mkv.html"
## [1] "\n\" Counting Cars S07E04 720p HDTV x264-KILLERS \"\n# Filename: Counting.Cars.S07E04.720p.HDTV.x264-KILLERS.mkv\n# Size: 631.49 MB\n\n\" Uploaded \"\nhttp:/uploaded.net/file/8mbzb83g/Counting.Cars.S07E04.720p.HDTV.x264-KILLERS.mkv\n\n\" Rapidgator \"\nhttp:/rapidgator.net/file/32070ef793f42a8307d21fe3237dedde/Counting.Cars.S07E04.720p.HDTV.x264-KILLERS.mkv.html\n\n\" Uploadrocket \"\nhttp:/uploadrocket.net/qsyzf4nlg2vp/Counting.Cars.S07E04.720p.HDTV.x264-KILLERS.mkv.html"
## [1] "\n\" Counting Cars S07E03 720p HDTV x264-KILLERS \"\n# Filename: Counting.Cars.S07E03.720p.HDTV.x264-KILLERS.mkv\n# Size: 583.19 MB\n\n\" Uploaded \"\nhttp:/uploaded.net/file/xqhf6css/Counting.Cars.S07E03.720p.HDTV.x264-KILLERS.mkv\n\n\" Rapidgator \"\nhttp:/rapidgator.net/file/7471aebb6fe68878bcd38fafa192e831/Counting.Cars.S07E03.720p.HDTV.x264-KILLERS.mkv.html\n\n\" Uploadrocket \"\nhttp:/uploadrocket.net/0jvn701so1uq/Counting.Cars.S07E03.720p.HDTV.x264-KILLERS.mkv.html"
Since the offending user is obvious, we can filter it out:
mebbe_r <- filter(r_pastes, user != "AllRls_net")
walk(mebbe_r$key, ~print(toString(get_paste(.))))
## [1] "t1 <- proc.time()\r\n\r\ndata <- read.csv(\"dataset.csv\")\r\ndata <- data[data$class == 13, ] # load data, only 13 class\r\ndata <- data[data$attendance != 0, ]\r\ndata <- data[!duplicated(data[6:32]),]\r\n#data <- unique(data) # unique\r\n\r\n#difficulty of 2,3 instr\r\ni <- data[data$instr == 2, ]\r\nmeans <- colMeans(i) # means of all columns\r\ndeviations <- colSds(as.matrix(i)) # standard deviations\r\n\r\ni <- data[data$instr == 3, ]\r\nmeans <- colMeans(i) # means of all columns\r\ndeviations <- colSds(as.matrix(i)) # standard deviations\r\n\r\nsums <- colSums(data[1:33]) # sums of all columns\r\nmeans <- colMeans(data) # means of all columns\r\nmedians <- colMedians(as.matrix(data[1:33]))# medians\r\ndeviations <- colSds(as.matrix(data[1:33])) # standard deviations\r\ndispersions <- deviations*deviations # dispersions\r\nquantiles <- colQuantiles(as.matrix(data[1:33])) #quantilies\r\nmins <- colMins(as.matrix(data[1:33])) # min of each column\r\nmaxs <- colMaxs(as.matrix(data[1:33])) # max of each column\r\ncor <- cor(data[6:33]) # variance covariance matrix\r\ndet(cov(data[6:33]))\r\n\r\nproc.time() - t1\r\n\r\n#graphic\r\nbarplot(data.frame(table(i$difficulty))[,2],names.arg = \"Difficulty\",col = \"blue\")\r\nbarplot(data.frame(table(i$difficulty))[,2],names.arg = \"Difficulty\",col = \"blue\")\r\nbarplot(sums,col = \"blue\")\r\nbarplot(medians,col = \"blue\")\r\nbarplot(maxs,col=\"blue\")\r\nbarplot(quantiles[,1],col = \"blue\")\r\nbarplot(quantiles[,2],col = \"blue\")\r\nbarplot(quantiles[,3],col = \"blue\")\r\nbarplot(as.vector(table(as.vector(as.matrix(data[,6:33])))),col=\"blue\")\r\nboxplot(data[6:32],col = \"blue\")\r\nplot(sort(rowSums(data[6:33])),col=\"blue\",type = \"s\")"
## [1] "library(rvest)\r\nURL <- \"https://archive.ics.uci.edu/ml/machine-learning-databases/eeg-mld/eeg_full/\"\r\n\r\npg <- read_html(URL)\r\nresult1 <- pg %>% html_nodes(\"a\") %>% html_attr(\"href\")\r\nresult2 <- result1[6:length(result1)]\r\n\r\nG <- 71 # asci for G\r\nset.seed(G)\r\nnAplus <- 10\r\nnA <- 5\r\nnBC <- 2\r\nmyGroupn <- nAplus\r\nusersToRead <- sample(1:length(result2),myGroupn,replace = FALSE)\r\nresult2[usersToRead]\r\n\r\n#418, 339, 417, 400, 415, 450, 351, 392, 416, 428\r\n\r\nuntar(\"co3a0000450.tar.gz\")\r\nuntar(\"co2c0000392.tar.gz\")\r\nuntar(\"co2c0000351.tar.gz\")\r\nuntar(\"co2c0000339.tar.gz\")\r\nuntar(\"co2a0000440.tar.gz\")\r\nuntar(\"co2a0000428.tar.gz\")\r\nuntar(\"co2a0000418.tar.gz\")\r\nuntar(\"co2a0000417.tar.gz\")\r\nuntar(\"co2a0000416.tar.gz\")\r\nuntar(\"co2a0000415.tar.gz\")\r\n\r\n#library(data.table)\r\n#list <- c(\"co3a0000450/\",\"co2c0000392/\",\"co2c0000351/\",\"co2c0000339/\",\"co2a0000440/\",\"co2a0000428/\",\"co2a0000418/\",\"co2a0000417/\",\"co2a0000416/\",\"co2a0000415/\")\r\n\r\nrep1 <- list.files(path = \"C:/Users/raulg/Desktop/statistics/co3a0000450/\",pattern = \".gz\")\r\nrep2 <- list.files(path = \"C:/Users/raulg/Desktop/statistics/co2c0000392/\",pattern = \".gz\")\r\nrep3 <- list.files(path = \"C:/Users/raulg/Desktop/statistics/co2c0000351/\",pattern = \".gz\")\r\nrep4 <- list.files(path = \"C:/Users/raulg/Desktop/statistics/co2c0000339/\",pattern = \".gz\")\r\nrep5 <- list.files(path = \"C:/Users/raulg/Desktop/statistics/co2a0000440/\",pattern = \".gz\")\r\nrep6 <- list.files(path = \"C:/Users/raulg/Desktop/statistics/co2a0000428/\",pattern = \".gz\")\r\nrep7 <- list.files(path = \"C:/Users/raulg/Desktop/statistics/co2a0000418/\",pattern = \".gz\")\r\nrep8 <- list.files(path = \"C:/Users/raulg/Desktop/statistics/co2a0000417/\",pattern = \".gz\")\r\nrep9 <- list.files(path = \"C:/Users/raulg/Desktop/statistics/co2a0000416/\",pattern = \".gz\")\r\nrep10 <- list.files(path = \"C:/Users/raulg/Desktop/statistics/co2a0000415/\",pattern = \".gz\")\r\n\r\ncount<-0\r\nfor (i in rep1){\r\n \r\n a <- gzfile(paste(\"C:/Users/raulg/Desktop/statistics/co3a0000450/\",i,sep=\"\"))\r\n res <- readLines(a)\r\n for (i in res){\r\n id<-paste(paste(strsplit(res[1],\"\")[[1]][11],strsplit(res[1],\"\")[[1]][12]),strsplit(res[1],\"\")[[1]][13])\r\n id<-gsub(\" \", \"\", id, fixed = TRUE)\r\n if(strsplit(res[1],\"\")[[1]][3]==\"a\"){\r\n alch <- \"alch\"}\r\n if(strsplit(res[1],\"\")[[1]][3]!=\"a\"){\r\n alch <- \"nonAl\"}\r\n \r\n lal <- data.frame(toString(alch),\r\n id,\r\n paste(paste(strsplit(res[4],\" \")[[1]][2],strsplit(res[4],\" \")[[1]][3]),\r\n strsplit(res[8],\" \")[[1]][1]),\r\n strsplit(res[4],\" \")[[1]][6],\r\n strsplit(res[2],\" \")[[1]][4],\r\n count,\r\n strsplit(res[3],\" \")[[1]][2])\r\n df <- rbind.data.frame(df,lal)\r\n count <- count +1\r\n print(i)\r\n if(count == 256){\r\n count<-0\r\n }\r\n}}"
## [1] "t1 <- proc.time()\r\nsetwd(\"C:\\\\Users\\\\Nikita\\\\Downloads\\\\r_lab\")\r\ntabl <- read.csv(file=\"turkiye.csv\", header = TRUE)\r\ntabl <- tabl[tabl$class == 10, ]\r\ntabl <- tabl[tabl$attendance != 0, ]\r\ntabl <- tabl[!duplicated(tabl[6:33]),]\r\n\r\nfor (i in 1:33)\r\n{\r\n sum[i] <- sum(tabl[ ,i])\r\n min[i] <- min(tabl[ ,i])\r\n max[i] <- max(tabl[ ,i])\r\n medians[i] <- median(tabl[ ,i])\r\n sd[i] <- sd(tabl[ ,i])\r\n var[i] <- var(tabl[ ,i])\r\n quantile_25[i] <- quantile(tabl[ ,i], probs = c(0.25))\r\n quantile_50[i] <- quantile(tabl[ ,i], probs = c(0.50))\r\n quantile_75[i] <- quantile(tabl[ ,i], probs = c(0.75))\r\n}\r\ncor <- cor(tabl[ ,3:33])\r\ncov <- cov(tabl[ ,3:33])\r\nproc.time() - t1\r\n\r\nsum_npPeret <- sum(tabl[ ,3])\r\nmin_npPeret <- min(tabl[ ,3])\r\nmax_npPeret <- max(tabl[ ,3])\r\nmean_npPeret <- mean(tabl[ ,3])\r\nvar_npPeret <- var(tabl[ ,3])\r\nsd_npPeret <- sd(tabl[ ,3])\r\nquantile_npPeret <- quantile(tabl[ ,3],probs = c(0.25,0.5,0.75))\r\nmedian_npPeret <- median(tabl[ ,3])\r\n\r\nsum_difficulty <- sum(tabl[ ,5])\r\nmin_difficulty <- min(tabl[ ,5])\r\nmax_difficulty <- max(tabl[ ,5])\r\nmean_difficulty <- mean(tabl[ ,5])\r\nvar_difficulty <- var(tabl[ ,5])\r\nsd_difficulty <- sd(tabl[ ,5])\r\nquantile_difficulty <- quantile(tabl[ ,5],probs = c(0.25,0.5,0.75))\r\nmedian_difficulty <- median(tabl[ ,5])\r\n\r\nsum_Q15 <- sum(tabl[ ,20])\r\nmin_Q15 <- min(tabl[ ,20])\r\nmax_Q15 <- max(tabl[ ,20])\r\nmean_Q15 <- mean(tabl[ ,20])\r\nvar_Q15 <- var(tabl[ ,20])\r\nsd_Q15 <- sd(tabl[ ,20])\r\nquantile_Q15 <- quantile(tabl[ ,20],probs = c(0.25,0.5,0.75))\r\nmedian_Q15 <- median(tabl[ ,20])"
## [1] "t1 <- proc.time()\r\n\r\ndata <- read.csv(\"dataset.csv\")\r\ndata <- data[data$class == 13, ] # load data, only 13 class\r\ndata <- data[data$attendance != 0, ]\r\ndata <- data[!duplicated(data[6:32]),]\r\n#data <- unique(data) # unique\r\n\r\n#difficulty of 2,3 instr\r\ni <- data[data$instr == 2, ]\r\nmeans <- colMeans(i) # means of all columns\r\ndeviations <- colSds(as.matrix(i)) # standard deviations\r\n\r\ni <- data[data$instr == 3, ]\r\nmeans <- colMeans(i) # means of all columns\r\ndeviations <- colSds(as.matrix(i)) # standard deviations\r\n\r\nsums <- colSums(data[1:33]) # sums of all columns\r\nmeans <- colMeans(data) # means of all columns\r\nmedians <- colMedians(as.matrix(data[1:33]))# medians\r\ndeviations <- colSds(as.matrix(data[1:33])) # standard deviations\r\ndispersions <- deviations*deviations # dispersions\r\nquantiles <- colQuantiles(as.matrix(data[1:33])) #quantilies\r\nmins <- colMins(as.matrix(data[1:33])) # min of each column\r\nmaxs <- colMaxs(as.matrix(data[1:33])) # max of each column\r\ncor <- cor(data[6:33]) # variance covariance matrix\r\ndet(cov(data[6:33]))\r\n\r\nproc.time() - t1\r\n\r\n#graphic\r\nbarplot(data.frame(table(i$difficulty))[,2],names.arg = \"Difficulty\",col = \"blue\")\r\nbarplot(data.frame(table(i$difficulty))[,2],names.arg = \"Difficulty\",col = \"blue\")\r\nbarplot(sums,col = \"blue\")\r\nbarplot(medians,col = \"blue\")\r\nbarplot(maxs,col=\"blue\")\r\nbarplot(quantiles[,1],col = \"blue\")\r\nbarplot(quantiles[,2],col = \"blue\")\r\nbarplot(quantiles[,3],col = \"blue\")\r\nbarplot(as.vector(table(as.vector(as.matrix(data[,6:33])))),col=\"blue\")\r\nboxplot(data[6:32],col = \"blue\")\r\nplot(sort(rowSums(data[6:33])),col=\"blue\",type = \"s\")\r\n\r\nproc.time() - t1"
## [1] "bartels_seno<-c();\r\nbartels_seno_statistic<-c();\r\nfor (k in 1:nrow(mC)) {\r\n bartels_seno[k]<-bartels.rank.test(mC[k,], alternative = \"two.sided\")\r\n bartels_seno_statistic[k]<-bartels_seno[k]$statistic\r\n}\r\n\r\n> for (k in 1:nrow(mC)) {\r\n+ bartels_seno[k]<-bartels.rank.test(mC[k,], alternative = \"two.sided\")\r\n+ bartels_seno_statistic[k]<-bartels_seno[k]$statistic\r\n+ }\r\nThere were 50 or more warnings (use warnings() to see the first 50)\r\n> warnings()\r\nWarning messages:\r\n1: In bartels_seno[k] <- bartels.rank.test(mC[k, ], alternative = \"two.sided\") :\r\n número de items para para sustituir no es un múltiplo de la longitud del reemplazo\r\n2: In bartels_seno[k] <- bartels.rank.test(mC[k, ], alternative = \"two.sided\") :\r\n número de items para para sustituir no es un múltiplo de la longitud del reemplazo\r\n3: In bartels_seno[k] <- bartels.rank.test(mC[k, ], alternative = \"two.sided\") :\r\n número de items para para sustituir no es un múltiplo de la longitud del reemplazo\r\n.\r\n.\r\n.\r\n50\r\n# Pruebas para chequear lo que hace\r\n> bartels_seno[1]$statistic\r\nNULL\r\n> bartels_seno[1]\r\n[[1]]\r\nstatistic\r\n-31.62401\r\n \r\n> bartels_seno[2]\r\n[[1]]\r\nstatistic\r\n-31.61939\r\n \r\n> length(bartels_seno)\r\n[1] 101"
## [1] "bartels_seno<-c();\r\nbartels_seno_statistic<-c();\r\nfor (k in 1:nrow(mC)) {\r\n bartels_seno[k]<-bartels.rank.test(mC[k,], alternative = \"two.sided\")\r\n bartels_seno_statistic[k]<-bartels_seno[k]$statistic\r\n}\r\n\r\nNULL\r\n> bartels_seno[1]\r\n[[1]]\r\nstatistic \r\n-31.62401 \r\n\r\n> bartels_seno[2]\r\n[[1]]\r\nstatistic \r\n-31.61939 \r\n\r\n> length(bartels_seno)\r\n[1] 101"
## [1] "Metro Systems Over Time: Part 3 - 01/02/2017\r\nhttps://datascienceplus.com/metro-systems-over-time-part-3/"
## [1] "dim(borboletas)[1] != dim(na.omit(borboletas))[1]"
## [1] "#Exercise 10.1\r\n####Subset the training set into two sets: class0 where Purchased = 0 and class1 where Purchased = 1).\r\n```{r exercice 10.1}\r\nclass0 = subset(training_set, training_set$Purchased==0)\r\nclass1 = subset(training_set, training_set$Purchased==1)\r\n```\r\n\r\n#Exercise 10.2\r\n####Compute pi0 and pi1\r\n```{r exercice 10.2}\r\npi0 = length(class0[[1]])/length(training_set[[1]])\r\npi1 = length(class1[[1]])/length(training_set[[1]])\r\n```\r\n\r\n#Exercise 10.3\r\n####Compute mu0 and mu1\r\n```{r exercice 10.3}\r\nmu0 = c(mean(class0$Age), mean(class0$EstimatedSalary))\r\nmu0\r\nmu1 = c(mean(class1$Age), mean(class1$EstimatedSalary))\r\nmu1\r\n```\r\n\r\n#Exercise 10.4\r\n####Compute Sum(sigma)\r\n```{r exercice 10.4}\r\nsigma=((nrow(class0)-1)*cov(class0[-3])+(nrow(class1)-1)*cov(class1[-3]))/(nrow(class0)+nrow(class1)-2)\r\nsigma\r\n```\r\n\r\n#Exercise 10.5\r\n####Now that we have computed all the needed estimates, we can calculate delta0 and delta1 for any observation x . And we will attribute x to the class with the highest delta . First, try it for x where xT=(1,1.5)xT=(1,1.5) , what is class prediction for this spesific xx ?\r\n```{r exercice 10.5}\r\nx=c(1,1.5)\r\n\r\ndelta0 = x%*%solve(sigma)%*%mu0-((0.5)%*%t(mu0)%*%solve(sigma)%*%mu0)+log(pi0)\r\ndelta1 = x%*%solve(sigma)%*%mu1-((0.5)%*%t(mu1)%*%solve(sigma)%*%mu1)+log(pi1)\r\n\r\ndelta0\r\ndelta1\r\n\r\n\r\nlibrary(MASS)\r\nclass1.lda <- lda(class1$Purchased~class1$Age+class1$EstimatedSalary, data=class1)\r\n\r\npredict(class1,newdata=x, type=\"response\")\r\n```\r\n\r\n#Exercise 10.6\r\n####\r\n```{r exercice 10.6}\r\npi = length(test_set[[1]])/length(test_set[[1]])\r\n\r\nmu = c(mean(test_set$Age), mean(test_set$EstimatedSalary))\r\n\r\nsigma2=((nrow(test_set)-1)*cov(test_set[-3])+(nrow(test_set)-1)*cov(test_set[-3]))/(nrow(test_set)+nrow(test_set)-2)\r\n\r\nsigma2\r\n\r\ndelta = x%*%solve(sigma2)%*%mu-((0.5)%*%t(mu)%*%solve(sigma2)%*%mu)+log(pi)\r\n\r\n```"
## [1] "#This project was done as a team with Tarun DeviReddy. \r\n #Referred to Yibo script (https://www.kaggle.com/yibochen/talkingdata-mobile-user-demographics/xgboost-in-r-2-27217)\r\n\r\n#Using Extreme Gradient Boosting. Here our aim is have device_id and the corresponding feature_name and rbind all such dataframes and convert it into a matrix to feed to the xgb function. \r\n\r\n#Grouping location: Replacing the (0,0) with means corresponding to a device id and making factors from continuous latitude and longitudes. This part can be optimized and made shorter.\r\nevents = fread(\"~/Downloads/TalkingData/events.csv\") %>% as.data.frame()\r\nevents_1 = subset(events, (events$longitude>1 | events$longitude< -1) | (events$latitude>1 | events$latitude< -1) )\r\n\r\nsplitmean_log <- function(df) {\r\n s <- split( df, df$device_id)\r\n sapply( s, function(x) mean(x$longitude) )\r\n \r\n}\r\nsplitmean_lat <- function(df) {\r\n s <- split( df, df$device_id)\r\n sapply( s, function(x) mean(x$latitude) )\r\n \r\n}\r\n\r\nmean_long = as.data.frame(splitmean_log(events_1))\r\nmean_long$device_id = rownames(mean_long)\r\nrownames(mean_long) <- NULL\r\ncolnames(mean_long) = c(\"longitude\", \"device_id\")\r\n\r\nmean_lat = as.data.frame(splitmean_lat(events_1))\r\nmean_lat$device_id = rownames(mean_lat)\r\nrownames(mean_lat)<-NULL\r\ncolnames(mean_lat) = c(\"latitude\",\"device_id\")\r\n\r\nlocation = data.frame(device_id=mean_lat$device_id, longitude = mean_long$longitude, \r\n latitude = mean_lat$latitude)\r\nrm(events,events_1, mean_lat, mean_long)\r\n\r\nlocation$longitude_n = ifelse((location$longitude>75 & location$longitude<135 & location$latitude>15 & location$latitude<55), location$longitude,\"Outside_China\")\r\nlocation$latitude_n = ifelse((location$longitude>75 & location$longitude<135 & location$latitude>15 & location$latitude<55), location$latitude,\"Outside_China\")\r\n\r\nlocation_new = subset(location, location$longitude_n!=\"Outside_China\")\r\n\r\nlocation_new$longgrp <- cut(location_new$longitude, \r\n breaks = c(-Inf, 80, 85, 90, 95, 100, 105, 110, 115, 120, 125, 130, Inf), \r\n labels = c(\"long1\", \"long2\", \"long3\", \"long4\", \"long5\",\"long6\", \"long7\", \"long8\", \"long9\", \"long10\", \"long11\", \"long12\"), \r\n right = FALSE)\r\n\r\nlocation_new$latgrp <- cut(location_new$latitude, \r\n breaks = c(-Inf, 15, 17.5, 20, 22.5, 25, 27.5, 30, 32.5, 35, 37.5, 40, 42.5, 45, 47.5, 50, 52.5, Inf), \r\n labels = c(\"lat1\", \"lat2\", \"lat3\", \"lat4\", \"lat5\",\"lat6\", \"lat7\", \"lat8\", \"lat9\", \"lat10\", \"lat11\", \"lat12\", \"lat13\", \"lat14\", \"lat15\",\"lat16\",\"lat17\"), \r\n right = FALSE)\r\nlocation_new$longitude<- location_new$latitude <- location_new$longitude_n <- location_new$latitude_n <- NULL\r\ncolnames(location_new) = c(\"device_id\", \"longitude\", \"latitude\")\r\nlocation_outside = subset(location, location$longitude_n==\"Outside_China\")\r\nlocation_outside$longitude <- location_outside$latitude <- NULL\r\ncolnames(location_outside) = c(\"device_id\", \"longitude\", \"latitude\")\r\nlocation = rbind(location_outside, location_new)\r\nrownames(location) <- NULL\r\nlocation$device_id = as.character(location$device_id)\r\ndevice_loc_long = data.frame(device_id = location$device_id, feature_name = location$longitude)\r\ndevice_loc_lat = data.frame(device_id = location$device_id, feature_name = location$latitude)\r\nrm(location_outside, location_new, location)\r\n\r\ndevice_loc_lat$device_id = 
as.character(device_loc_lat$device_id)\r\ndevice_loc_long$device_id = as.character(device_loc_long$device_id)\r\n\r\ndevice_loc_lat$feature_name = as.character(device_loc_lat$feature_name)\r\ndevice_loc_long$feature_name = as.character(device_loc_long$feature_name)\r\n\r\n\r\n#make time_avg, time_mdn: Creating the average number of events per day and the most and least used duration in a day\r\n\r\nevents = fread(\"~/Downloads/TalkingData/events.csv\") %>% as.data.frame()\r\nevents$date = as.Date(sapply(strsplit(events$timestamp, split = \" \"), head, 1))\r\nevents$time = as.numeric(substr(events$timestamp, 12,13))\r\nevents$Morning <- 0\r\nevents$Day <- 0\r\nevents$Night <- 0\r\n\r\nevents$Morning[events$time>4 & events$time<10] = 1\r\nevents$Day[events$time>=10 & events$time<21] = 1\r\nevents$Night = as.numeric(!(events$Morning | events$Day))\r\n\r\nevents$device_id = as.character(events$device_id)\r\ntmp1 = as.data.frame(sapply(split(events$Morning,events$device_id), sum))\r\ntmp1$device_id = as.character(rownames(tmp1))\r\nrownames(tmp1) <- NULL\r\ncolnames(tmp1) = c(\"Morning\", \"device_id\")\r\ntmp2 = as.data.frame(sapply(split(events$Day,events$device_id), sum))\r\ntmp2$device_id = as.character(rownames(tmp2))\r\nrownames(tmp2) <- NULL\r\ncolnames(tmp2) = c(\"Day\", \"device_id\")\r\ntmp3 = as.data.frame(sapply(split(events$Night,events$device_id), sum))\r\ntmp3$device_id = as.character(rownames(tmp3))\r\nrownames(tmp3) <- NULL\r\ncolnames(tmp3) = c(\"Night\", \"device_id\")\r\nevents$Morning<-events$Day<-events$Night<-NULL\r\n\r\ntime = merge(events, tmp1, by = \"device_id\", all.x=T)\r\ntime1 = merge(time, tmp2, by = \"device_id\", all.x=T)\r\ntime2 = merge(time1, tmp3, by = \"device_id\", all.x=T)\r\ntime = time2\r\nrm(time1, time2)\r\nrm(tmp1, tmp2, tmp3)\r\ntime$total = time$Morning+time$Day+time$Night\r\ntime$Morning = time$Morning/time$total\r\ntime$Day = time$Day/time$total\r\ntime$Night = time$Night/time$total\r\n\r\ntmp = as.data.frame(sapply(split(events$date,events$device_id), function(x) length(unique(x))))\r\ntmp$device_id = as.character(rownames(tmp))\r\ntmp$noday = tmp$`sapply(split(events$date, events$device_id), function(x) length(unique(x)))`\r\ntmp$`sapply(split(events$date, events$device_id), function(x) length(unique(x)))` <-NULL\r\nrownames(tmp) <- NULL\r\n\r\ntime[,2:7]=NULL\r\n\r\ntime_final = unique(time)\r\n\r\ntime_final = merge(time_final, tmp, by = \"device_id\", all.x = T)\r\n\r\ntime_final$avgevt = (time_final$total / time_final$noday)\r\ntime_final$MDN = paste(colnames(time_final[,2:4])[apply(time_final[,2:4],1,which.max)],\r\n colnames(time_final[,2:4])[apply(time_final[,2:4],1,which.min)],sep=\"-\")\r\n\r\ntime_final$avg = ifelse(time_final$avgevt<2, \"Low\",\r\n ifelse(time_final$avgevt<10, \"Med\", \"High\") )\r\n\r\ntime_mdn = time_final[,c(1,8)]\r\ncolnames(time_mdn) = c(\"device_id\", \"feature_name\")\r\ntime_avg = time_final[,c(1,9)]\r\ncolnames(time_avg) = c(\"device_id\", \"feature_name\")\r\nrm(events, time, time_final, tmp)\r\n\r\n#Creating the data for model and brand\r\n\r\nlabel_train <- fread(\"~/Downloads/TalkingData/gender_age_train.csv\",\r\n colClasses=c(\"character\",\"character\",\r\n \"integer\",\"character\"))\r\nlabel_test <- fread(\"~/Downloads/TalkingData/gender_age_test.csv\",\r\n colClasses=c(\"character\"))\r\nlabel_test$gender <- label_test$age <- label_test$group <- NA\r\nlabel <- rbind(label_train,label_test)\r\nsetkey(label,device_id)\r\nrm(label_test,label_train);gc()\r\n\r\nbrand = 
fread(\"~/Downloads/TalkingData/phone_brand_device_model.csv\",\r\n colClasses=c(\"character\",\"character\",\"character\")) \r\n\r\n\r\nsetkey(brand,device_id)\r\nbrand2 = unique(brand)\r\nlabel1 <- merge(label,brand2,by=\"device_id\",all.x=T)\r\n\r\nrm(brand, brand2);gc()\r\n\r\nevents = fread(\"~/Downloads/TalkingData/events.csv\", colClasses=c(\"character\",\"character\",\"character\",\r\n \"numeric\",\"numeric\"))\r\nsetkeyv(events,c(\"device_id\",\"event_id\"))\r\nevent_app <- fread(\"~/Downloads/TalkingData/app_events.csv\",\r\n colClasses=rep(\"character\",4))\r\nsetkey(event_app,event_id)\r\n\r\n#events <- unique(events[,list(device_id,event_id)],by=NULL)\r\n\r\n#list of apps corresponding to each event\r\nevent_apps <- event_app[,list(apps=paste(unique(app_id),collapse=\",\")),by=\"event_id\"]\r\ndevice_event_apps <- merge(events,event_apps,by=\"event_id\")\r\nrm(events,event_app,event_apps);gc()\r\n\r\nf_split_paste <- function(z){paste(unique(unlist(strsplit(z,\",\"))),collapse=\",\")}\r\ndevice_apps <- device_event_apps[,list(apps=f_split_paste(apps)),by=\"device_id\"]\r\nrm(device_event_apps,f_split_paste);gc()\r\n\r\n\r\ntmp <- strsplit(device_apps$apps,\",\")\r\ndevice_apps <- data.table(device_id=rep(device_apps$device_id,\r\n times=sapply(tmp,length)),\r\n app_id=unlist(tmp))\r\nrm(tmp)\r\n\r\n#Introducing all the app categories, no groupings done\r\n\r\n#make device_cat\r\n\r\napp_labels = fread(\"~/Downloads/TalkingData/app_labels.csv\") %>% as.data.frame()\r\nlabel_categories = fread(\"~/Downloads/TalkingData/label_categories.csv\") %>% as.data.frame()\r\n\r\napp_cat = merge(app_labels, label_categories, by=\"label_id\", all.x= T)\r\nuni_appcat = unique(app_cat[,c(1,3)])\r\nuni_appcat$category = as.character(uni_appcat$category)\r\nuni_appcat$category = tolower(uni_appcat$category)\r\n#uni_appcat$cat = ifelse(grepl(\"gam|war|rac|mmo|dota|play|ball|chess|fight|tennis|billard|puzz|poker|sport|shoot|rpg\", uni_appcat$category), \"game\",uni_appcat$category)\r\n\r\napp = merge(app_labels,uni_appcat,by=\"label_id\",all.x = T)\r\napp = na.omit(app)\r\napp = app[,c(2,3)]\r\ncolnames(app) = c(\"app_id\", \"feature_name\")\r\napp$app_id = as.character(app$app_id)\r\napp = unique(app)\r\nrm(uni_appcat,app_labels,label_categories, app_cat)\r\ndevice_cat = merge(device_apps, app, by = \"app_id\", all.x = T, allow.cartesian = T)\r\ndevice_cat$app_id<-NULL\r\ndevice_cat = unique(device_cat)\r\ndevice_cat = na.omit(device_cat)\r\nrownames(device_cat)<-NULL\r\nrm(app)\r\n\r\n#xgb: merging all the data tables together and making the matrix in xgb format \r\n\r\n\r\n\r\nd1 <- label1[,list(device_id,phone_brand)]\r\nlabel1$phone_brand <- NULL\r\nd2 <- label1[,list(device_id,device_model)]\r\nlabel1$device_model <- NULL\r\nd3 <- device_apps\r\nrm(device_apps)\r\nd1[,phone_brand:=paste0(\"phone_brand:\",phone_brand)]\r\nd2[,device_model:=paste0(\"device_model:\",device_model)]\r\nd3[,app_id:=paste0(\"app_id:\",app_id)]\r\nnames(d1) <- names(d2) <- names(d3) <- c(\"device_id\",\"feature_name\")\r\ndd <- rbind(d1,d2,d3)\r\ndd = rbind(dd, time_mdn, device_cat, device_loc_lat, device_loc_long)\r\nrow.names(dd) <- NULL\r\n\r\nrm(d1,d2,d3);gc()\r\nrequire(Matrix)\r\nii <- unique(dd$device_id)\r\njj <- unique(dd$feature_name)\r\nid_i <- match(dd$device_id,ii)\r\nid_j <- match(dd$feature_name,jj)\r\nid_ij <- cbind(id_i,id_j)\r\nM <- Matrix(0,nrow=length(ii),ncol=length(jj),\r\n dimnames=list(ii,jj),sparse=T)\r\n\r\nM[id_ij] <- 1\r\nrm(ii,jj,id_i,id_j,id_ij,dd);gc()\r\n\r\nx <- M[rownames(M) %in% 
label1$device_id,]\r\nid <- label1$device_id[match(rownames(x),label1$device_id)]\r\ny <- label1$group[match(rownames(x),label1$device_id)]\r\nrm(M,label1)\r\n\r\n# level reduction\r\nx_train <- x[!is.na(y),]\r\ntmp_cnt_train <- colSums(x_train)\r\nx <- x[,tmp_cnt_train>0 & tmp_cnt_train\r\n\r\nrequire(xgboost)\r\n(group_name <- na.omit(unique(y)))\r\nidx_train <- which(!is.na(y))\r\nidx_test <- which(is.na(y))\r\ntrain_data <- x[idx_train,]\r\ntest_data <- x[idx_test,]\r\ntrain_label <- match(y[idx_train],group_name)-1\r\ntest_label <- match(y[idx_test],group_name)-1\r\ndtrain <- xgb.DMatrix(train_data,label=train_label,missing=NA)\r\ndtest <- xgb.DMatrix(test_data,label=test_label,missing=NA)\r\n\r\nparam <- list(booster=\"gblinear\",\r\n num_class=length(group_name),\r\n objective=\"multi:softprob\",\r\n eval_metric=\"mlogloss\",\r\n eta=0.01,\r\n lambda=5,\r\n lambda_bias=0,\r\n alpha=2)\r\nwatchlist <- list(train=dtrain)\r\nset.seed(114)\r\nfit_cv <- xgb.cv(params=param,\r\n data=dtrain,\r\n nrounds=100000,\r\n watchlist=watchlist,\r\n nfold=5,\r\n early.stop.round=3,\r\n verbose=1)\r\n\r\nntree <- 250 # the value obtained from CV\r\nset.seed(114)\r\nfit_xgb <- xgb.train(params=param,\r\n data=dtrain,\r\n nrounds=ntree,\r\n watchlist=watchlist,\r\n verbose=1)\r\npred <- predict(fit_xgb,dtest)\r\npred_detail <- t(matrix(pred,nrow=length(group_name)))\r\nres_submit <- cbind(id=id[idx_test],as.data.frame(pred_detail))\r\ncolnames(res_submit) <- c(\"device_id\",group_name)\r\nwrite.csv(res_submit,file=\"submit_ultimate1.csv\",row.names=F,quote=F)"
## [1] "#This project was done as a team with Tarun DeviReddy. \r\n #Referred to Yibo script (https://www.kaggle.com/yibochen/talkingdata-mobile-user-demographics/xgboost-in-r-2-27217)\r\n\r\n#Using Extreme Gradient Boosting. Here our aim is have device_id and the corresponding feature_name and rbind all such dataframes and convert it into a matrix to feed to the xgb function. \r\n\r\n#Grouping location: Replacing the (0,0) with means corresponding to a device id and making factors from continuous latitude and longitudes. This part can be optimized and made shorter.\r\nevents = fread(\"~/Downloads/TalkingData/events.csv\") %>% as.data.frame()\r\nevents_1 = subset(events, (events$longitude>1 | events$longitude< -1) | (events$latitude>1 | events$latitude< -1) )\r\n\r\nsplitmean_log <- function(df) {\r\n s <- split( df, df$device_id)\r\n sapply( s, function(x) mean(x$longitude) )\r\n \r\n}\r\nsplitmean_lat <- function(df) {\r\n s <- split( df, df$device_id)\r\n sapply( s, function(x) mean(x$latitude) )\r\n \r\n}\r\n\r\nmean_long = as.data.frame(splitmean_log(events_1))\r\nmean_long$device_id = rownames(mean_long)\r\nrownames(mean_long) <- NULL\r\ncolnames(mean_long) = c(\"longitude\", \"device_id\")\r\n\r\nmean_lat = as.data.frame(splitmean_lat(events_1))\r\nmean_lat$device_id = rownames(mean_lat)\r\nrownames(mean_lat)<-NULL\r\ncolnames(mean_lat) = c(\"latitude\",\"device_id\")\r\n\r\nlocation = data.frame(device_id=mean_lat$device_id, longitude = mean_long$longitude, \r\n latitude = mean_lat$latitude)\r\nrm(events,events_1, mean_lat, mean_long)\r\n\r\nlocation$longitude_n = ifelse((location$longitude>75 & location$longitude<135 & location$latitude>15 & location$latitude<55), location$longitude,\"Outside_China\")\r\nlocation$latitude_n = ifelse((location$longitude>75 & location$longitude<135 & location$latitude>15 & location$latitude<55), location$latitude,\"Outside_China\")\r\n\r\nlocation_new = subset(location, location$longitude_n!=\"Outside_China\")\r\n\r\nlocation_new$longgrp <- cut(location_new$longitude, \r\n breaks = c(-Inf, 80, 85, 90, 95, 100, 105, 110, 115, 120, 125, 130, Inf), \r\n labels = c(\"long1\", \"long2\", \"long3\", \"long4\", \"long5\",\"long6\", \"long7\", \"long8\", \"long9\", \"long10\", \"long11\", \"long12\"), \r\n right = FALSE)\r\n\r\nlocation_new$latgrp <- cut(location_new$latitude, \r\n breaks = c(-Inf, 15, 17.5, 20, 22.5, 25, 27.5, 30, 32.5, 35, 37.5, 40, 42.5, 45, 47.5, 50, 52.5, Inf), \r\n labels = c(\"lat1\", \"lat2\", \"lat3\", \"lat4\", \"lat5\",\"lat6\", \"lat7\", \"lat8\", \"lat9\", \"lat10\", \"lat11\", \"lat12\", \"lat13\", \"lat14\", \"lat15\",\"lat16\",\"lat17\"), \r\n right = FALSE)\r\nlocation_new$longitude<- location_new$latitude <- location_new$longitude_n <- location_new$latitude_n <- NULL\r\ncolnames(location_new) = c(\"device_id\", \"longitude\", \"latitude\")\r\nlocation_outside = subset(location, location$longitude_n==\"Outside_China\")\r\nlocation_outside$longitude <- location_outside$latitude <- NULL\r\ncolnames(location_outside) = c(\"device_id\", \"longitude\", \"latitude\")\r\nlocation = rbind(location_outside, location_new)\r\nrownames(location) <- NULL\r\nlocation$device_id = as.character(location$device_id)\r\ndevice_loc_long = data.frame(device_id = location$device_id, feature_name = location$longitude)\r\ndevice_loc_lat = data.frame(device_id = location$device_id, feature_name = location$latitude)\r\nrm(location_outside, location_new, location)\r\n\r\ndevice_loc_lat$device_id = 
as.character(device_loc_lat$device_id)\r\ndevice_loc_long$device_id = as.character(device_loc_long$device_id)\r\n\r\ndevice_loc_lat$feature_name = as.character(device_loc_lat$feature_name)\r\ndevice_loc_long$feature_name = as.character(device_loc_long$feature_name)\r\n\r\n\r\n#make time_avg, time_mdn: Creating the average number of events per day and the most and least used duration in a day\r\n\r\nevents = fread(\"~/Downloads/TalkingData/events.csv\") %>% as.data.frame()\r\nevents$date = as.Date(sapply(strsplit(events$timestamp, split = \" \"), head, 1))\r\nevents$time = as.numeric(substr(events$timestamp, 12,13))\r\nevents$Morning <- 0\r\nevents$Day <- 0\r\nevents$Night <- 0\r\n\r\nevents$Morning[events$time>4 & events$time<10] = 1\r\nevents$Day[events$time>=10 & events$time<21] = 1\r\nevents$Night = as.numeric(!(events$Morning | events$Day))\r\n\r\nevents$device_id = as.character(events$device_id)\r\ntmp1 = as.data.frame(sapply(split(events$Morning,events$device_id), sum))\r\ntmp1$device_id = as.character(rownames(tmp1))\r\nrownames(tmp1) <- NULL\r\ncolnames(tmp1) = c(\"Morning\", \"device_id\")\r\ntmp2 = as.data.frame(sapply(split(events$Day,events$device_id), sum))\r\ntmp2$device_id = as.character(rownames(tmp2))\r\nrownames(tmp2) <- NULL\r\ncolnames(tmp2) = c(\"Day\", \"device_id\")\r\ntmp3 = as.data.frame(sapply(split(events$Night,events$device_id), sum))\r\ntmp3$device_id = as.character(rownames(tmp3))\r\nrownames(tmp3) <- NULL\r\ncolnames(tmp3) = c(\"Night\", \"device_id\")\r\nevents$Morning<-events$Day<-events$Night<-NULL\r\n\r\ntime = merge(events, tmp1, by = \"device_id\", all.x=T)\r\ntime1 = merge(time, tmp2, by = \"device_id\", all.x=T)\r\ntime2 = merge(time1, tmp3, by = \"device_id\", all.x=T)\r\ntime = time2\r\nrm(time1, time2)\r\nrm(tmp1, tmp2, tmp3)\r\ntime$total = time$Morning+time$Day+time$Night\r\ntime$Morning = time$Morning/time$total\r\ntime$Day = time$Day/time$total\r\ntime$Night = time$Night/time$total\r\n\r\ntmp = as.data.frame(sapply(split(events$date,events$device_id), function(x) length(unique(x))))\r\ntmp$device_id = as.character(rownames(tmp))\r\ntmp$noday = tmp$`sapply(split(events$date, events$device_id), function(x) length(unique(x)))`\r\ntmp$`sapply(split(events$date, events$device_id), function(x) length(unique(x)))` <-NULL\r\nrownames(tmp) <- NULL\r\n\r\ntime[,2:7]=NULL\r\n\r\ntime_final = unique(time)\r\n\r\ntime_final = merge(time_final, tmp, by = \"device_id\", all.x = T)\r\n\r\ntime_final$avgevt = (time_final$total / time_final$noday)\r\ntime_final$MDN = paste(colnames(time_final[,2:4])[apply(time_final[,2:4],1,which.max)],\r\n colnames(time_final[,2:4])[apply(time_final[,2:4],1,which.min)],sep=\"-\")\r\n\r\ntime_final$avg = ifelse(time_final$avgevt<2, \"Low\",\r\n ifelse(time_final$avgevt<10, \"Med\", \"High\") )\r\n\r\ntime_mdn = time_final[,c(1,8)]\r\ncolnames(time_mdn) = c(\"device_id\", \"feature_name\")\r\ntime_avg = time_final[,c(1,9)]\r\ncolnames(time_avg) = c(\"device_id\", \"feature_name\")\r\nrm(events, time, time_final, tmp)\r\n\r\n#Creating the data for model and brand\r\n\r\nlabel_train <- fread(\"~/Downloads/TalkingData/gender_age_train.csv\",\r\n colClasses=c(\"character\",\"character\",\r\n \"integer\",\"character\"))\r\nlabel_test <- fread(\"~/Downloads/TalkingData/gender_age_test.csv\",\r\n colClasses=c(\"character\"))\r\nlabel_test$gender <- label_test$age <- label_test$group <- NA\r\nlabel <- rbind(label_train,label_test)\r\nsetkey(label,device_id)\r\nrm(label_test,label_train);gc()\r\n\r\nbrand = 
fread(\"~/Downloads/TalkingData/phone_brand_device_model.csv\",\r\n colClasses=c(\"character\",\"character\",\"character\")) \r\n\r\n\r\nsetkey(brand,device_id)\r\nbrand2 = unique(brand)\r\nlabel1 <- merge(label,brand2,by=\"device_id\",all.x=T)\r\n\r\nrm(brand, brand2);gc()\r\n\r\nevents = fread(\"~/Downloads/TalkingData/events.csv\", colClasses=c(\"character\",\"character\",\"character\",\r\n \"numeric\",\"numeric\"))\r\nsetkeyv(events,c(\"device_id\",\"event_id\"))\r\nevent_app <- fread(\"~/Downloads/TalkingData/app_events.csv\",\r\n colClasses=rep(\"character\",4))\r\nsetkey(event_app,event_id)\r\n\r\n#events <- unique(events[,list(device_id,event_id)],by=NULL)\r\n\r\n#list of apps corresponding to each event\r\nevent_apps <- event_app[,list(apps=paste(unique(app_id),collapse=\",\")),by=\"event_id\"]\r\ndevice_event_apps <- merge(events,event_apps,by=\"event_id\")\r\nrm(events,event_app,event_apps);gc()\r\n\r\nf_split_paste <- function(z){paste(unique(unlist(strsplit(z,\",\"))),collapse=\",\")}\r\ndevice_apps <- device_event_apps[,list(apps=f_split_paste(apps)),by=\"device_id\"]\r\nrm(device_event_apps,f_split_paste);gc()\r\n\r\n\r\ntmp <- strsplit(device_apps$apps,\",\")\r\ndevice_apps <- data.table(device_id=rep(device_apps$device_id,\r\n times=sapply(tmp,length)),\r\n app_id=unlist(tmp))\r\nrm(tmp)\r\n\r\n#Introducing all the app categories, no groupings done\r\n\r\n#make device_cat\r\n\r\napp_labels = fread(\"~/Downloads/TalkingData/app_labels.csv\") %>% as.data.frame()\r\nlabel_categories = fread(\"~/Downloads/TalkingData/label_categories.csv\") %>% as.data.frame()\r\n\r\napp_cat = merge(app_labels, label_categories, by=\"label_id\", all.x= T)\r\nuni_appcat = unique(app_cat[,c(1,3)])\r\nuni_appcat$category = as.character(uni_appcat$category)\r\nuni_appcat$category = tolower(uni_appcat$category)\r\n#uni_appcat$cat = ifelse(grepl(\"gam|war|rac|mmo|dota|play|ball|chess|fight|tennis|billard|puzz|poker|sport|shoot|rpg\", uni_appcat$category), \"game\",uni_appcat$category)\r\n\r\napp = merge(app_labels,uni_appcat,by=\"label_id\",all.x = T)\r\napp = na.omit(app)\r\napp = app[,c(2,3)]\r\ncolnames(app) = c(\"app_id\", \"feature_name\")\r\napp$app_id = as.character(app$app_id)\r\napp = unique(app)\r\nrm(uni_appcat,app_labels,label_categories, app_cat)\r\ndevice_cat = merge(device_apps, app, by = \"app_id\", all.x = T, allow.cartesian = T)\r\ndevice_cat$app_id<-NULL\r\ndevice_cat = unique(device_cat)\r\ndevice_cat = na.omit(device_cat)\r\nrownames(device_cat)<-NULL\r\nrm(app)\r\n\r\n#xgb: merging all the data tables together and making the matrix in xgb format \r\n\r\n\r\n\r\nd1 <- label1[,list(device_id,phone_brand)]\r\nlabel1$phone_brand <- NULL\r\nd2 <- label1[,list(device_id,device_model)]\r\nlabel1$device_model <- NULL\r\nd3 <- device_apps\r\nrm(device_apps)\r\nd1[,phone_brand:=paste0(\"phone_brand:\",phone_brand)]\r\nd2[,device_model:=paste0(\"device_model:\",device_model)]\r\nd3[,app_id:=paste0(\"app_id:\",app_id)]\r\nnames(d1) <- names(d2) <- names(d3) <- c(\"device_id\",\"feature_name\")\r\ndd <- rbind(d1,d2,d3)\r\ndd = rbind(dd, time_mdn, device_cat, device_loc_lat, device_loc_long)\r\nrow.names(dd) <- NULL\r\n\r\nrm(d1,d2,d3);gc()\r\nrequire(Matrix)\r\nii <- unique(dd$device_id)\r\njj <- unique(dd$feature_name)\r\nid_i <- match(dd$device_id,ii)\r\nid_j <- match(dd$feature_name,jj)\r\nid_ij <- cbind(id_i,id_j)\r\nM <- Matrix(0,nrow=length(ii),ncol=length(jj),\r\n dimnames=list(ii,jj),sparse=T)\r\n\r\nM[id_ij] <- 1\r\nrm(ii,jj,id_i,id_j,id_ij,dd);gc()\r\n\r\nx <- M[rownames(M) %in% 
label1$device_id,]\r\nid <- label1$device_id[match(rownames(x),label1$device_id)]\r\ny <- label1$group[match(rownames(x),label1$device_id)]\r\nrm(M,label1)\r\n\r\n# level reduction\r\nx_train <- x[!is.na(y),]\r\ntmp_cnt_train <- colSums(x_train)\r\nx <- x[,tmp_cnt_train>0 & tmp_cnt_train\r\n\r\nrequire(xgboost)\r\n(group_name <- na.omit(unique(y)))\r\nidx_train <- which(!is.na(y))\r\nidx_test <- which(is.na(y))\r\ntrain_data <- x[idx_train,]\r\ntest_data <- x[idx_test,]\r\ntrain_label <- match(y[idx_train],group_name)-1\r\ntest_label <- match(y[idx_test],group_name)-1\r\ndtrain <- xgb.DMatrix(train_data,label=train_label,missing=NA)\r\ndtest <- xgb.DMatrix(test_data,label=test_label,missing=NA)\r\n\r\nparam <- list(booster=\"gblinear\",\r\n num_class=length(group_name),\r\n objective=\"multi:softprob\",\r\n eval_metric=\"mlogloss\",\r\n eta=0.01,\r\n lambda=5,\r\n lambda_bias=0,\r\n alpha=2)\r\nwatchlist <- list(train=dtrain)\r\nset.seed(114)\r\nfit_cv <- xgb.cv(params=param,\r\n data=dtrain,\r\n nrounds=100000,\r\n watchlist=watchlist,\r\n nfold=5,\r\n early.stop.round=3,\r\n verbose=1)\r\n\r\nntree <- 250 # the value obtained from CV\r\nset.seed(114)\r\nfit_xgb <- xgb.train(params=param,\r\n data=dtrain,\r\n nrounds=ntree,\r\n watchlist=watchlist,\r\n verbose=1)\r\npred <- predict(fit_xgb,dtest)\r\npred_detail <- t(matrix(pred,nrow=length(group_name)))\r\nres_submit <- cbind(id=id[idx_test],as.data.frame(pred_detail))\r\ncolnames(res_submit) <- c(\"device_id\",group_name)\r\nwrite.csv(res_submit,file=\"submit_ultimate1.csv\",row.names=F,quote=F)"
## [1] "# El objetivo es ver cómo cambia el parámetro estadístico de un test al probar diferentes niveles de ruido mezclados en una señal. Suni me ha sugerido esta fórmula: S = (1-x)*A + x*Ruido, donde A es una señal que yo me he creado (sintética) -> un seno por ejemplo y Ruido es una señal de ruido blanco gaussiano.\r\n# Por ejemplo, una combinación podría ser: S1 = 0.9*A + 0.1*Ruido y después a esa señal S1, aplicarle un test de aleatoriedad cuyo valor estadístico del test debería ser bajo con respecto al resto, mientras que, en contra partida, el valor del test de aleatoriedad para S2 = 0.1*A + 0.9*Ruido debería ser muy alto.\r\n# Ahora bien, uno de los \"problemas\" es que S1, S2...Sn no son un único valor, son unas 'señales resultado' de 1000 muestras (todas las señales sintéticas creadas tienen 1000 muestras), por lo que no podemos trabajar con vectores, sino con dataframes.\r\n# Se me ocurre plantearlo de manera que cada una de mis filas equivale a 1 vector (el paso es de 0.01 para ser más precisos así que tendría 101 filas; y cada columna sería cada uno de los valores de mi 'señal resultado'\r\n# Al ejecutar esto me da error de longitudes, yo creo que es porque no está metiendo bien el vector en la matriz....entiendo que la línea dC[j,k]<-((1-i)*senoVect) + (noiseVect*i) está mal, pero no sé cómo hacerlo...yo no veo ahí posibilidad de que el término de la derecha dependa de j y k...o hay que meter alguna línea previa?\r\n \r\ndC<-data.frame();\r\nfor(k in 1:length(senoVect)) { #control de columna\r\n for(j in 1:length(seq(from=0, to=1, by=0.01))){ #control de fila\r\n for(i in seq(from=0, to=1, by=0.01)) { #control del coeficiente\r\n dC[j,k]<-((1-i)*senoVect) + (noiseVect*i)\r\n }\r\n }\r\n}\r\n# Una vez se tiene el data frame, se podría hacer o una función o un for, que recorra el data frame y haga el test a cada \"fila i\" y lo almacene en un vector de 101 muestras y, finalmente, representarlas y ver el comportamiento del índice estadístico.\r\n\r\n# Error\r\nError in `[<-.data.frame`(`*tmp*`, j, k, value = c(0, 0.0314107590781283, : \r\n replacement has 1000 rows, data has 1 \r\n\r\n# Workspace\r\nhttp://www.subirimagenes.com/imagen-capturadepantalla-9682023.html\r\n\r\n# hay variables que no uso pero que se cargan directamente al abrir el programa porque hay un workspace guardado...como puedes ver, el código apenas se ejecuta, no entra en el for de la variable i."
## [1] "getwd()\r\nsetwd(\"C:/Users/PC01/Desktop/R_ejemplos\")\r\nload(\"ModelosMCO.RData\")\r\n\r\nsummary(modelo)\r\nR2_adj=1-(1-R_2)*((n-1)/(n-p-1))\r\n\r\nVarBmco=as.numeric(sigma)*solve(t(x)%*%x)\r\nsdb1=sqrt(VarBmco[2,2])\r\ntc=Bmco[2]/sdb1\r\n\r\n#Prueba de significancia global del modelo\r\nq=p\r\nx1=x[,2:(p+1)]\r\nxc=(I(n)-(1/n)*J(n))%*%x1\r\nB1=Bmco[2:(p+1)]\r\nFc=(t(B1)%*%t(xc)%*%xc%*%B1)/(p*as.numeric(sigma))\r\npvalor=1-pf(Fc,p,n-p-1)\r\n\r\nR=matrix(0,nrow=p,ncol=p+1)\r\nR[,2:(p+1)]=I(p)"
## [1] "# El objetivo es ver cómo cambia el parámetro estadístico de un test al probar diferentes niveles de ruido mezclados en una señal. Suni me ha sugerido esta fórmula: S = (1-x)*A + x*Ruido, donde A es una señal que yo me he creado (sintética) -> un seno por ejemplo y Ruido es una señal de ruido blanco gaussiano.\r\n# Por ejemplo, una combinación podría ser: S1 = 0.9*A + 0.1*Ruido y después a esa señal S1, aplicarle un test de aleatoriedad cuyo valor estadístico del test debería ser bajo con respecto al resto, mientras que, en contra partida, el valor del test de aleatoriedad para S2 = 0.1*A + 0.9*Ruido debería ser muy alto.\r\n# Ahora bien, uno de los \"problemas\" es que S1, S2...Sn no son un único valor, son unas 'señales resultado' de 1000 muestras (todas las señales sintéticas creadas tienen 1000 muestras), por lo que no podemos trabajar con vectores, sino con dataframes.\r\n# Se me ocurre plantearlo de manera que cada una de mis filas equivale a 1 vector (el paso es de 0.01 para ser más precisos así que tendría 101 filas; y cada columna sería cada uno de los valores de mi 'señal resultado'\r\n# Al ejecutar esto me da error de longitudes, yo creo que es porque no está metiendo bien el vector en la matriz....entiendo que la línea dC[j,k]<-((1-i)*senoVect) + (noiseVect*i) está mal, pero no sé cómo hacerlo...yo no veo ahí posibilidad de que el término de la derecha dependa de j y k...o hay que meter alguna línea previa?\r\n\r\ndC<-data.frame();\r\nfor(k in 1:length(senoVect)) { #control de columna\r\n for(j in 1:length(seq(from=0, to=1, by=0.01))){ #control de fila\r\n for(i in seq(from=0, to=1, by=0.01)) { #control del coeficiente\r\n dC[j,k]<-((1-i)*senoVect) + (noiseVect*i)\r\n }\r\n }\r\n}\r\n# Una vez se tiene el data frame, se podría hacer o una función o un for, que recorra el data frame y haga el test a cada \"fila i\" y lo almacene en un vector de 101 muestras y, finalmente, representarlas y ver el comportamiento del índice estadístico."
## [1] "#############\r\n# Problem 2 #\r\n#############\r\nSpiderSpeeds <- c(1.25, 1.64, 1.91, 2.31, 2.37, 2.38, 2.84, 2.87, 2.93, 2.94, 2.98, 3.00, 3.09, 3.22, 3.41)\r\n\r\n# a)\r\nhist(SpiderSpeeds, main=\"Frequency of Spider Running Speeds\", xlab = 'Running Speed (cm/s)')\r\n\r\n# b)\r\nmean(SpiderSpeeds)\r\nsd(SpiderSpeeds)\r\n\r\n# c)\r\nsummary(SpiderSpeeds)\r\n\r\n# d)\r\nmax(SpiderSpeeds) - min(SpiderSpeeds)\r\nIQR(SpiderSpeeds)"
library(pastebin)
library(testthat)
date()
## [1] "Sun Feb 5 15:02:48 2017"
test_dir("tests/")
## testthat results ========================================================================================================
## OK: 0 SKIPPED: 0 FAILED: 0
##
## DONE ===================================================================================================================