forked from dataforgoodfr/croixrouge
Commit
Showing 7 changed files with 419 additions and 0 deletions.
Binary file renamed (+11.7 MB): ...DA agrégée/2015-04-03-db-aida_pri.sql.zip → R/RData/aida_agg.RData
Binary file not shown.
@@ -0,0 +1,130 @@
require(data.table)


### DIM_STR_DEPT
# delete the empty DIM_STR_DEPT table
aida_agg[["DIM_STR_DEPT"]] <- NULL

### DIM_DATE
# delete the following unnecessary columns in DIM_DATE
# NUM_INV_MOIS
# LB_MOIS
# NUM_INV_TRIM
# LB_TRIM
# LIB_ANNEE
aida_agg[["DIM_DATE"]][, c("NUM_INV_MOIS","LB_MOIS","NUM_INV_TRIM","LB_TRIM", "LIB_ANNEE"):=NULL]


### OUTLIERS IN STO_FOUR_ORI and ART_DIS_SURFAMILLE
# remove outliers in tables STO_FOUR_ORI & ART_DIS_SURFAMILLE. No apparent outliers in ART_DIS_FAMILLE
# rule: remove all rows where QTE_NETTE or POIDS_NET is > 1e7
aida_agg[["STO_FOUR_ORI"]] <- aida_agg[["STO_FOUR_ORI"]][aida_agg[["STO_FOUR_ORI"]]$QTE_NETTE < 1e7 & aida_agg[["STO_FOUR_ORI"]]$POIDS_NET < 1e7,]
aida_agg[["ART_DIS_SURFAMILLE"]] <- aida_agg[["ART_DIS_SURFAMILLE"]][aida_agg[["ART_DIS_SURFAMILLE"]]$QTE_NETTE < 1e7 & aida_agg[["ART_DIS_SURFAMILLE"]]$POIDS_NET < 1e7,]


### NAs
# look for all the tables with NAs
search_nas(aida_agg)

ginfo(aida_agg[["BEN_DUREE_DROIT"]])
ginfo(aida_agg[["DIM_HIER_STR"]])
ginfo(aida_agg[["DIM_U2A"]])

aida_agg[["BEN_DUREE_DROIT"]][!complete.cases(aida_agg[["BEN_DUREE_DROIT"]])]
aida_agg[["DIM_HIER_STR"]][!complete.cases(aida_agg[["DIM_HIER_STR"]])]
aida_agg[["DIM_U2A"]][!complete.cases(aida_agg[["DIM_U2A"]])]

colinfo(aida_agg[["BEN_DUREE_DROIT"]])
colinfo(aida_agg[["DIM_HIER_STR"]])
colinfo(aida_agg[["DIM_U2A"]])

### DUPLICATES
# look for all the tables with duplicates
search_dup(aida_agg)

### FIX ENCODING PROBLEMS
print("fixing encoding problems")
for(nm in names(aida_agg))
  aida_agg[[nm]] <- solve_encoding(aida_agg[[nm]])

### STO_FOUR_ORI
# remove duplicates from the table "STO_FOUR_ORI"
aida_agg[["STO_FOUR_ORI"]] <- aida_agg[["STO_FOUR_ORI"]][!duplicated(aida_agg[["STO_FOUR_ORI"]])]
# inconsistency with the "Centre distribution Pont de Beauvoisin" ==> what action should be taken?
aida_agg[["DIM_U2A"]][aida_agg[["DIM_U2A"]]$LB_U2A == "Centre distribution Pont de Beauvoisin"]
# list the centres that are not labelled: "A DEFINIR"
cdu2a_non_lb = aida_agg[["DIM_U2A"]][sapply(aida_agg[["DIM_U2A"]]$LB_U2A, FUN = function(x){substring(x,1,9) == "A DEFINIR"} ) ]
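
# Illustrative check (not in the original script): number of distribution units
# whose label is still "A DEFINIR".
print(nrow(cdu2a_non_lb))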


### Look for duplicates in 2 similar data tables, remove them and create a new unified data table
print("concat similar tables")

# BEN_CHARGES_FOURCH & BEN_CHARGES_FOURCH_N
print("BEN_CHARGES_FOURCH & BEN_CHARGES_FOURCH_N")
aida_agg[["BEN_CHARGES_FOURCH"]] <- search_dup(aida_agg[["BEN_CHARGES_FOURCH"]], aida_agg[["BEN_CHARGES_FOURCH_N"]], remove = T)
aida_agg[["BEN_CHARGES_FOURCH_N"]] <- NULL

# BEN_CHARGES_NB_FICHES & BEN_CHARGES_NB_FICHES_N
print("BEN_CHARGES_NB_FICHES & BEN_CHARGES_NB_FICHES_N")
aida_agg[["BEN_CHARGES_NB_FICHES"]] <- search_dup(aida_agg[["BEN_CHARGES_NB_FICHES"]], aida_agg[["BEN_CHARGES_NB_FICHES_N"]], remove = T)
aida_agg[["BEN_CHARGES_NB_FICHES_N"]] <- NULL

# BEN_DUREE_FOYER & BEN_DUREE_FOYER_N
print("BEN_DUREE_FOYER & BEN_DUREE_FOYER_N")
aida_agg[["BEN_DUREE_FOYER"]] <- search_dup(aida_agg[["BEN_DUREE_FOYER"]], aida_agg[["BEN_DUREE_FOYER_N"]], remove = T)
aida_agg[["BEN_DUREE_FOYER_N"]] <- NULL

# BEN_REVENUS_FOURCH & BEN_REVENUS_FOURCH_N & BEN_REVENUS_FOURCH_NEW
# !! BEN_REVENUS_FOURCH has different columns from the other two tables.
# We therefore concatenate only BEN_REVENUS_FOURCH_N & BEN_REVENUS_FOURCH_NEW,
# then delete BEN_REVENUS_FOURCH_N.
print("BEN_REVENUS_FOURCH & BEN_REVENUS_FOURCH_N & BEN_REVENUS_FOURCH_NEW")
aida_agg[["BEN_REVENUS_FOURCH_NEW"]] <- search_dup(aida_agg[["BEN_REVENUS_FOURCH_N"]], aida_agg[["BEN_REVENUS_FOURCH_NEW"]], remove = T)
aida_agg[["BEN_REVENUS_FOURCH_N"]] <- NULL


# BEN_REVENUS_NB_FICHES & BEN_REVENUS_NB_FICHES_N
print("BEN_REVENUS_NB_FICHES & BEN_REVENUS_NB_FICHES_N")
aida_agg[["BEN_REVENUS_NB_FICHES"]] <- search_dup(aida_agg[["BEN_REVENUS_NB_FICHES"]], aida_agg[["BEN_REVENUS_NB_FICHES_N"]], remove = T)
aida_agg[["BEN_REVENUS_NB_FICHES_N"]] <- NULL

# BEN_SEXE_AYANTDROIT & BEN_SEXE_AYANTDROIT_N
print("BEN_SEXE_AYANTDROIT & BEN_SEXE_AYANTDROIT_N")
aida_agg[["BEN_SEXE_AYANTDROIT"]] <- search_dup(aida_agg[["BEN_SEXE_AYANTDROIT"]], aida_agg[["BEN_SEXE_AYANTDROIT_N"]], remove = T)
aida_agg[["BEN_SEXE_AYANTDROIT_N"]] <- NULL

# BEN_SEXE_CHEFFOYER & BEN_SEXE_CHEFFOYER_N
print("BEN_SEXE_CHEFFOYER & BEN_SEXE_CHEFFOYER_N")
aida_agg[["BEN_SEXE_CHEFFOYER"]] <- search_dup(aida_agg[["BEN_SEXE_CHEFFOYER"]], aida_agg[["BEN_SEXE_CHEFFOYER_N"]], remove = T)
aida_agg[["BEN_SEXE_CHEFFOYER_N"]] <- NULL

# BEN_STATUT_FOYER & BEN_STATUT_FOYER_N
print("BEN_STATUT_FOYER & BEN_STATUT_FOYER_N")
aida_agg[["BEN_STATUT_FOYER"]] <- search_dup(aida_agg[["BEN_STATUT_FOYER"]], aida_agg[["BEN_STATUT_FOYER_N"]], remove = T)
aida_agg[["BEN_STATUT_FOYER_N"]] <- NULL

# BEN_TYPO_FOYER & BEN_TYPO_FOYER_N
print("BEN_TYPO_FOYER & BEN_TYPO_FOYER_N")
aida_agg[["BEN_TYPO_FOYER"]] <- search_dup(aida_agg[["BEN_TYPO_FOYER"]], aida_agg[["BEN_TYPO_FOYER_N"]], remove = T)
aida_agg[["BEN_TYPO_FOYER_N"]] <- NULL

# BEN_TYP_FOYER & BEN_TYP_FOYER_N
print("BEN_TYP_FOYER & BEN_TYP_FOYER_N")
aida_agg[["BEN_TYP_FOYER"]] <- search_dup(aida_agg[["BEN_TYP_FOYER"]], aida_agg[["BEN_TYP_FOYER_N"]], remove = T)
aida_agg[["BEN_TYP_FOYER_N"]] <- NULL

### Remove the DIM_SEXE table and replace the code with "H", "F", "HF"
# mapping: 0 -> "HF", 1 -> "H", 2 -> "F"
tbles = check_col("CD_SEXE", aida_agg)[!(check_col("CD_SEXE", aida_agg) %in% "DIM_SEXE")]
for(nm in tbles)
{
  aida_agg[[nm]][, "SEXE":=as.character(sapply(.SD,
      FUN = function(x){sapply(x, FUN = function(y) {if(y == 0) "HF" else if(y == 1) "H" else if(y == 2) "F"})})), .SDcols = "CD_SEXE"]
  print(class(aida_agg[[nm]]$SEXE))
  aida_agg[[nm]][, CD_SEXE:=NULL]
}
aida_agg[["DIM_SEXE"]] <- NULL

### "DIM_TYPO_FOYER_BEN" --> delete the first category, which is irrelevant
check_col("CD_TYPO_FOYER_BEN", aida_agg)
aida_agg[["DIM_TYPO_FOYER_BEN"]] <- aida_agg[["DIM_TYPO_FOYER_BEN"]][2:nrow(aida_agg[["DIM_TYPO_FOYER_BEN"]]), ]
@@ -0,0 +1,159 @@
# functions
# ginfo <- function(dt)
# colinfo <- function(dt)
# search_nas <- function(dts)
# search_dup <- function(dts, dt2 = NULL, remove = F)
# solve_encoding <- function(dt)
# check_col <- function(col_names, dts = aida_agg)
# info_date <- function(dt, name_month = "ID_MOIS")
# r_to_sql: create an sql database from a list of R data tables
# sql_to_r: create a list of R data tables from an sql database


### exp_tools.R
### tool functions for exploring databases.

# for tables that contain date information (month), check:
# - the period covered
# - the missing months
# - the number of quotes per date
info_date <- function(dt, name_month = "ID_MOIS")
{
  check_col(name_month, aida_agg)
  start_date = as.character(min(dt[[name_month]]))
  starty = as.numeric(substr(start_date,1,4))
  startm = substr(start_date,5,6)
  end_date = as.character(max(dt[[name_month]]))
  endy = substr(end_date,1,4)
  endm = substr(end_date,5,6)
  dates = numeric()
  months = c("01","02","03","04","05","06","07","08","09","10","11","12")
  years = as.character(starty:endy)
  for(yr in years)
    dates = c(dates, as.numeric(paste0(yr,months)))
  dates <- as.character(dates[dates >= start_date & dates <= end_date])

  ret = data.table(ID_MOIS = dates,
                   NB_QUOTES = sapply(dates, FUN = function(x){ sum(dt[[name_month]] == x)}))
  ret
}
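
# Example (not run), assuming aida_agg is loaded and the table has an ID_MOIS
# column in YYYYMM format:
#   info_date(aida_agg[["BEN_DUREE_DROIT"]], name_month = "ID_MOIS")
# returns one row per month between the first and last observed month, with the
# number of records found for that month (0 flags a missing month).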

glob_info_date <- function(dts = aida_agg)
{
  tbles = check_col("ID_MOIS", dts)
  start = character()
  end = character()
  miss = numeric()
  miss_months = character()
  for(nm in tbles)
  {
    inf_date = info_date(dts[[nm]])
    start = c(start, inf_date$ID_MOIS[1])
    end = c(end, inf_date$ID_MOIS[nrow(inf_date)])
    miss = c(miss, sum(inf_date$NB_QUOTES == 0))
    miss_months = c(miss_months, paste(inf_date$ID_MOIS[inf_date$NB_QUOTES == 0], collapse = ', '))
  }

  ret = data.table(TABLE = tbles,
                   START = start,
                   END = end,
                   NB_MISS_MONTH = miss,
                   MISSING_MONTH = miss_months)
  ret
}


# global information on a data table
ginfo <- function(dt)
{
  ret = data.table(NB_QUOTES = nrow(dt),
                   NB_DUP = sum(duplicated(dt)),
                   PROP_DUP = sprintf("%.1f %%", 100*sum(duplicated(dt))/nrow(dt)),
                   NB_INCOMPLETE = sum(!complete.cases(dt)),
                   PROP_INCOMPLETE = sprintf("%.1f %%", 100*sum(!complete.cases(dt))/nrow(dt)))
  ret
}

# information about each column of the data table
colinfo <- function(dt)
{
  NB_NA = sapply(dt, FUN = function(x) { sum(is.na(x))})
  PROP_NA = sapply(dt, FUN = function(x) { sprintf("%.1f %%", 100* sum(is.na(x))/nrow(dt))})
  NB_UNIQUE = sapply(dt, FUN = function(x) { length(unique(na.omit(x))) })
  PROP_UNIQUE = sapply(dt, FUN = function(x) { sprintf("%.1f %%",100 * length(unique(na.omit(x)))/ nrow(dt))})

  COL = colnames(dt)
  ret = data.table(COL, NB_NA, PROP_NA, NB_UNIQUE, PROP_UNIQUE, keep.rownames = T)
  ret
}
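
# Example (not run), on any of the imported tables:
#   ginfo(aida_agg[["DIM_U2A"]])    # row count, duplicate and incomplete-row ratios
#   colinfo(aida_agg[["DIM_U2A"]])  # per-column NA and distinct-value counts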

# look for all the data tables that contain NAs
search_nas <- function(dts)
{
  tables = character()
  for(nm in names(dts))
    if(sum(is.na(dts[[nm]])) != 0)
      tables <- c(tables, nm)
  tables
}


# look for duplicates
# - called with a single list of data tables (dt2 = NULL): return the names of the
#   tables that contain duplicated rows
# - called with two data tables: rbind them, print a summary and return either the
#   deduplicated result (remove = T) or the duplicated rows (remove = F)
search_dup <- function(dts, dt2 = NULL, remove = F)
{
  if(is.null(dt2))
  {
    tables = character()
    for(nm in names(dts))
    {
      if(sum(duplicated(dts[[nm]])) != 0)
        tables <- c(tables, nm)
    }

    tables
  }
  else
  {
    tp = rbind(dts, dt2)
    print(ginfo(tp))
    if(remove)
      return(tp[!duplicated(tp)])
    else
      return(tp[duplicated(tp)])
  }
}
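
# Example (not run):
#   search_dup(aida_agg)                # names of the tables with duplicated rows
#   search_dup(dt_a, dt_b, remove = T)  # dt_a and dt_b stacked, duplicates dropped
# (dt_a / dt_b are placeholders for two data tables with identical columns.)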

# fix encoding problems in all character columns of a data table
# (replaces latin1 accented bytes by their unaccented ASCII equivalent)
solve_encoding <- function(dt)
{
  for(cl in colnames(dt))
    if(class(dt[[cl]]) == "character")
    {
      # e
      dt[[cl]] <- gsub("(\xe8)|(\xe9)|(\xea)","e",dt[[cl]])
      # o
      dt[[cl]] <- gsub("(\U3e34663c)", "o", dt[[cl]])
      dt[[cl]] <- gsub("(\xf4)", "o", dt[[cl]])
      # a
      dt[[cl]] <- gsub("\xe2","a", dt[[cl]])
      # i
      dt[[cl]] <- gsub("\xee", "i", dt[[cl]])
      # c
      dt[[cl]] <- gsub("\xe7", "c", dt[[cl]])
    }
  dt
}
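
# Example (not run), assuming a latin1-encoded string as produced by the MySQL import:
#   solve_encoding(data.table(LB = "probl\xe8me d\xe9tect\xe9"))
#   # the LB column becomes "probleme detecte"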

# check_col returns the names of the data tables in which all the given columns are present
check_col <- function(col_names, dts = aida_agg)
{
  tables = character()
  for(nm in names(dts))
  {
    if(sum(col_names %in% names(dts[[nm]])) == length(col_names))
      tables <- c(tables, nm)
  }
  tables
}
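
# Example (not run):
#   check_col("CD_SEXE")               # tables of aida_agg containing a CD_SEXE column
#   check_col(c("ID_MOIS", "CD_SEXE")) # tables containing both columns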
@@ -0,0 +1,63 @@
### sql_tools.R
### tool functions for writing R data tables to an sql database and vice versa.

# to use this function, you need a running MySQL server (and the RMySQL package)
sql_to_r <- function(dbname, user, password, host = 'localhost', rdata_path = NULL, sql_file = NULL)
{
  require(RMySQL)

  # connect to the database server
  mydb = dbConnect(MySQL(), user = user, password = password, host = host)
  if(!is.null(sql_file))
  {
    dbSendQuery(mydb, paste0('CREATE DATABASE IF NOT EXISTS ', dbname))
    system(paste0('mysql -u ', user,' -p',password,' ', dbname ,' < ',sql_file))
  }

  dbSendQuery(mydb, paste0("USE ", dbname))
  tables_name = dbListTables(mydb)

  # fill the data table list
  dt_list = list()
  for(nm in tables_name)
  {
    if(substr(nm,1,1) == "#")
      next
    dt_list[[nm]] = data.table(dbReadTable(mydb, nm))
  }

  # save the list of data tables to RData format
  if(!is.null(rdata_path))
    save(list = "dt_list", file = rdata_path)

  # disconnect from the database
  dbDisconnect(mydb)

  # return the list of R data tables
  dt_list
}
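
# Example (not run), with placeholder credentials and paths:
#   aida_agg = sql_to_r(dbname = 'aida_agg', sql_file = 'db-aida_pri.sql',
#                       user = 'root', password = '****', host = 'localhost')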


r_to_sql <- function(dt_list, dbname, user, password, host = 'localhost', sqlfile_path = NULL)
{
  require(RMySQL)

  # connect to the database server
  mydb = dbConnect(MySQL(), user = user, password = password, host = host)

  # recreate the database from scratch
  dbSendQuery(mydb, paste0('DROP DATABASE IF EXISTS ', dbname))
  dbSendQuery(mydb, paste0('CREATE DATABASE ', dbname))
  dbSendQuery(mydb, paste0("USE ", dbname))

  tables_name = names(dt_list)

  for(nm in tables_name)
    dbWriteTable(mydb, name = nm, value = dt_list[[nm]], overwrite = TRUE)

  # write a new sql file
  if(!is.null(sqlfile_path))
    system(paste0('mysqldump -u ',user,' -p',password, ' ', dbname ,' > ',sqlfile_path))

  # disconnect from the database
  dbDisconnect(mydb)
}
@@ -0,0 +1,28 @@
rm(list = ls())
source("~/Desktop/croixrouge/R/tools/exp_tools.R")
source("~/Desktop/croixrouge/R/tools/sql_tools.R")
require(data.table)


# load the sql dump into a list of R data tables
print('sql to R:')
aida_agg = sql_to_r(dbname = 'aida_agg', sql_file = '/home/selim/Desktop/tests/2015-04-03-db-aida_pri.sql',
                    user = 'root', password = '1234', host = 'localhost', rdata_path = NULL)


# run the cleaning script
print('clean the data:')
source('/home/selim/Desktop/croixrouge/R/data_cleaning.R')


# write the cleaned R data back to sql format
print('r to sql:')
r_to_sql(dt_list = aida_agg, dbname = 'aida_agg',
         user = 'root', password = '1234', host = 'localhost', sqlfile_path = '~/Desktop/tests/2015-06-02-db-aida_pri.sql')


# zip the file
system('cd ~/Desktop/tests ; zip --password vincalmeouhumidechien 2015-06-02-db-aida_pri.sql.zip 2015-06-02-db-aida_pri.sql')

# save the new database to R format
save(list = 'aida_agg', file = '/home/selim/Desktop/croixrouge/R/RData/aida_agg.RData')
Binary file not shown.