Skip to content

Commit

Permalink
nettoyage de la base agregee AIDA
Browse files Browse the repository at this point in the history
  • Loading branch information
selimrbd committed Jun 2, 2015
1 parent 875806b commit b9f5a4a
Show file tree
Hide file tree
Showing 7 changed files with 419 additions and 0 deletions.
Binary file not shown.
130 changes: 130 additions & 0 deletions R/data_cleaning.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,130 @@
require(data.table)


### DIM_STR_DEPT
# The DIM_STR_DEPT table is empty: remove it from the list of tables.
aida_agg[["DIM_STR_DEPT"]] <- NULL

### DIM_DATE
# Remove (by reference) the DIM_DATE columns that are not needed:
# NUM_INV_MOIS, LB_MOIS, NUM_INV_TRIM, LB_TRIM, LIB_ANNEE.
aida_agg[["DIM_DATE"]][
  ,
  c("NUM_INV_MOIS", "LB_MOIS", "NUM_INV_TRIM", "LB_TRIM", "LIB_ANNEE") := NULL
]


### OUTLIERS IN STO_FOUR_ORI and ART_DIS_SURFAMILLE
# Outlier rule, applied to both tables (no apparent outliers in
# ART_DIS_FAMILLE): keep only the rows where QTE_NETTE and POIDS_NET are both
# strictly below 1e7.
for (nm in c("STO_FOUR_ORI", "ART_DIS_SURFAMILLE")) {
  aida_agg[[nm]] <- aida_agg[[nm]][
    aida_agg[[nm]]$QTE_NETTE < 1e7 & aida_agg[[nm]]$POIDS_NET < 1e7,
  ]
}



### NAs
# List every table that contains at least one NA.
# The diagnostics below are print()ed explicitly: this script is run through
# source(), which discards the value of bare top-level expressions.
print(search_nas(aida_agg))

# Detailed report for the tables inspected for missing values: global summary,
# the incomplete rows themselves, then the per-column summary.
for (nm in c("BEN_DUREE_DROIT", "DIM_HIER_STR", "DIM_U2A")) {
  print(nm)
  print(ginfo(aida_agg[[nm]]))
  print(aida_agg[[nm]][!complete.cases(aida_agg[[nm]])])
  print(colinfo(aida_agg[[nm]]))
}

### DUPLICATES
# List every table that contains duplicated rows.
print(search_dup(aida_agg))

### FIX ENCODING PROBLEMS
print("reglage des problemes d'encodage")
for (nm in names(aida_agg)) {
  aida_agg[[nm]] <- solve_encoding(aida_agg[[nm]])
}

### STO_FOUR_ORI
# Remove the duplicated rows of the "STO_FOUR_ORI" table.
aida_agg[["STO_FOUR_ORI"]] <- aida_agg[["STO_FOUR_ORI"]][!duplicated(aida_agg[["STO_FOUR_ORI"]])]

### DIM_U2A
# Inconsistency around the "Centre distribution Pont de Beauvoisin" centre
# (open question: what action should be taken?). The matching rows are printed
# explicitly because source() discards bare top-level values.
print(aida_agg[["DIM_U2A"]][aida_agg[["DIM_U2A"]]$LB_U2A == "Centre distribution Pont de Beauvoisin"])

# Keep a list of the centres without a proper label, i.e. whose label starts
# with "A DEFINIR". The comparison is vectorised, so it also behaves on an
# empty table (an element-wise sapply() would return a list there).
cdu2a_non_lb <- aida_agg[["DIM_U2A"]][substring(aida_agg[["DIM_U2A"]]$LB_U2A, 1, 9) == "A DEFINIR"]


### Merge pairs of similar tables: stack the two tables, drop the duplicated
### rows and keep a single unified table.
print("concat similar tables")

# Standard case: "<BASE>" and "<BASE>_N" are merged into "<BASE>", then
# "<BASE>_N" is removed.
std_merge <- function(base) {
  list(
    label = paste(base, "&", paste0(base, "_N")),
    first = base,
    second = paste0(base, "_N"),
    target = base,
    obsolete = paste0(base, "_N")
  )
}

# The merges are listed in the order in which they must run.
merge_specs <- c(
  lapply(
    c("BEN_CHARGES_FOURCH", "BEN_CHARGES_NB_FICHES", "BEN_DUREE_FOYER"),
    std_merge
  ),
  # !! BEN_REVENUS_FOURCH has different columns from the other two tables.
  # Only BEN_REVENUS_FOURCH_N and BEN_REVENUS_FOURCH_NEW are therefore
  # concatenated (into BEN_REVENUS_FOURCH_NEW), and BEN_REVENUS_FOURCH_N is
  # deleted. BEN_REVENUS_FOURCH itself is left untouched.
  list(list(
    label = "BEN_REVENUS_FOURCH & BEN_REVENUS_FOURCH_N & BEN_REVENUS_FOURCH_NEW",
    first = "BEN_REVENUS_FOURCH_N",
    second = "BEN_REVENUS_FOURCH_NEW",
    target = "BEN_REVENUS_FOURCH_NEW",
    obsolete = "BEN_REVENUS_FOURCH_N"
  )),
  lapply(
    c(
      "BEN_REVENUS_NB_FICHES", "BEN_SEXE_AYANTDROIT", "BEN_SEXE_CHEFFOYER",
      "BEN_STATUT_FOYER", "BEN_TYPO_FOYER", "BEN_TYP_FOYER"
    ),
    std_merge
  )
)

for (spec in merge_specs) {
  print(spec$label)
  # A missing table must not reach search_dup(): with a NULL second argument
  # it switches to its "list the tables with duplicates" mode and would
  # overwrite the target table with a character vector.
  if (is.null(aida_agg[[spec$first]]) || is.null(aida_agg[[spec$second]])) {
    stop("cannot merge ", spec$label, ": one of the tables is missing",
         call. = FALSE)
  }
  aida_agg[[spec$target]] <- search_dup(
    aida_agg[[spec$first]],
    aida_agg[[spec$second]],
    remove = TRUE
  )
  aida_agg[[spec$obsolete]] <- NULL
}

# Do not leave the temporary helpers behind in the workspace.
rm(std_merge, merge_specs, spec)

### Remove the DIM_SEXE table and replace the CD_SEXE code by a SEXE label:
### 0 -> "HF", 1 -> "H", 2 -> "F"
sexe_labels <- c("0" = "HF", "1" = "H", "2" = "F")

# check_col() takes the column names first and the list of tables second; the
# arguments are named so the call cannot silently match nothing (which would
# skip the recoding below while DIM_SEXE is still deleted).
tbles <- setdiff(check_col(col_names = "CD_SEXE", dts = aida_agg), "DIM_SEXE")
for (nm in tbles) {
  sexe_dt <- aida_agg[[nm]]
  # Vectorised lookup; a code outside 0/1/2 (or NA) gives NA instead of a
  # bogus "NULL" string.
  sexe_dt[, SEXE := unname(sexe_labels[as.character(CD_SEXE)])]
  if (any(is.na(sexe_dt$SEXE) & !is.na(sexe_dt$CD_SEXE))) {
    warning("table ", nm, ": unknown CD_SEXE codes were recoded to NA",
            call. = FALSE)
  }
  sexe_dt[, CD_SEXE := NULL]
  # Re-assign so the change is kept even if the table had to be copied.
  aida_agg[[nm]] <- sexe_dt
}
if (length(tbles) > 0L) {
  rm(sexe_dt)
}
rm(sexe_labels)
aida_agg[["DIM_SEXE"]] <- NULL

### "DIM_TYPO_FOYER_BEN" --> delete the first category, which is irrelevant
# Show which tables carry the CD_TYPO_FOYER_BEN column. check_col() takes the
# column names first, hence the named arguments; the result is printed because
# source() discards bare top-level values.
print(check_col(col_names = "CD_TYPO_FOYER_BEN", dts = aida_agg))

# Drop the first row. A logical mask is used instead of 2:nrow(...), which
# would select rows 2 and 1 on a one-row table.
# NOTE(review): this is not idempotent - every run of the script on the same
# data removes one more row.
aida_agg[["DIM_TYPO_FOYER_BEN"]] <- aida_agg[["DIM_TYPO_FOYER_BEN"]][
  seq_len(nrow(aida_agg[["DIM_TYPO_FOYER_BEN"]])) > 1L,
]
159 changes: 159 additions & 0 deletions R/tools/exp_tools.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
#functions
# ginfo <- function(dt)
# colinfo <- function(dt)
# search_nas <- function(dts)
# search_dup <- function(dts, dt2 = NULL, remove = F)
# solve_encoding <- function(dt)
# check_col <- function(col_names, dts = aida_agg)
# info_date <- function(dt, name_month = "ID_MOIS")
# r_to_sql: create an sql database from a list of R data tables
# sql_to_r: create a list of R data tables from an sql database


### exp_tools.R
### tool functions for exploring databases.

# Monthly row counts for a table that carries a month identifier.
#
# For the period covered by the table (first to last month present) this
# reports, for every calendar month of that period, how many rows the table
# holds; months with a count of 0 are the missing months.
#
# Arguments:
#   dt          table (data.table) to inspect.
#   name_month  name of the month column. The values are read as YYYYMM codes
#               (e.g. 201504) - assumption to confirm against the database.
#
# Returns a data.table with one row per calendar month of the covered period:
#   ID_MOIS    the month code, as character
#   NB_QUOTES  number of rows of `dt` for that month
# A table without any usable month value gives a 0-row result.
info_date <- function(dt, name_month = "ID_MOIS") {
  month_col <- dt[[name_month]]
  if (is.null(month_col)) {
    stop("column '", name_month, "' not found in the table", call. = FALSE)
  }

  # Work on integers throughout so that comparisons never mix numbers and
  # strings; NA months are ignored.
  month_ids <- as.integer(as.character(month_col))
  month_ids <- month_ids[!is.na(month_ids)]
  if (length(month_ids) == 0L) {
    return(data.table(ID_MOIS = character(), NB_QUOTES = integer()))
  }

  first_month <- min(month_ids)
  last_month <- max(month_ids)

  # Every YYYYMM code of the covered years, in chronological order, then
  # restricted to the covered period.
  years <- (first_month %/% 100L):(last_month %/% 100L)
  calendar <- as.vector(outer(1:12, years * 100L, "+"))
  calendar <- calendar[calendar >= first_month & calendar <= last_month]

  data.table(
    ID_MOIS = as.character(calendar),
    NB_QUOTES = tabulate(match(month_ids, calendar), nbins = length(calendar))
  )
}

# Date coverage summary for every table that has an ID_MOIS column.
#
# Arguments:
#   dts  named list of tables (defaults to the global `aida_agg`).
#
# Returns a data.table with one row per table holding an ID_MOIS column:
#   TABLE          table name
#   START, END     first and last month reported by info_date()
#   NB_MISS_MONTH  number of months of the period without any row
#   MISSING_MONTH  those months, comma-separated
glob_info_date <- function(dts = aida_agg) {
  # Look the tables up in `dts` itself, not in the global default of check_col.
  tbles <- check_col("ID_MOIS", dts = dts)

  per_table <- lapply(tbles, function(nm) {
    inf_date <- info_date(dts[[nm]])
    months <- as.character(inf_date$ID_MOIS)
    missing <- months[which(inf_date$NB_QUOTES == 0)]
    has_months <- length(months) > 0L
    # Plain character scalars are extracted: concatenating one-cell tables
    # would turn START/END into list columns.
    list(
      start = if (has_months) months[1L] else NA_character_,
      end = if (has_months) months[length(months)] else NA_character_,
      nb_miss = as.numeric(length(missing)),
      miss_months = paste(missing, collapse = ", ")
    )
  })

  pick_chr <- function(field) {
    vapply(per_table, function(x) x[[field]], character(1))
  }

  # Returned visibly so the summary shows up when called at the prompt.
  data.table(
    TABLE = tbles,
    START = pick_chr("start"),
    END = pick_chr("end"),
    NB_MISS_MONTH = vapply(per_table, function(x) x[["nb_miss"]], numeric(1)),
    MISSING_MONTH = pick_chr("miss_months")
  )
}



# One-row summary of a table: number of rows, number and share of duplicated
# rows, number and share of rows with at least one missing value.
ginfo <- function(dt) {
  n_rows <- nrow(dt)
  n_dup <- sum(duplicated(dt))
  n_incomplete <- sum(!complete.cases(dt))

  # Share of the table, formatted like "12.3 %".
  as_percent <- function(count) {
    sprintf("%.1f %%", 100 * count / n_rows)
  }

  data.table(
    NB_QUOTES = n_rows,
    NB_DUP = n_dup,
    PROP_DUP = as_percent(n_dup),
    NB_INCOMPLETE = n_incomplete,
    PROP_INCOMPLETE = as_percent(n_incomplete)
  )
}

# Per-column summary of a table: for each column, the number and share of NA
# values and the number and share (relative to the row count) of distinct
# non-NA values.
colinfo <- function(dt) {
  n_rows <- nrow(dt)

  count_na <- function(column) {
    sum(is.na(column))
  }
  count_unique <- function(column) {
    length(unique(na.omit(column)))
  }
  # Share of the rows, formatted like "12.3 %".
  as_percent <- function(count) {
    sprintf("%.1f %%", 100 * count / n_rows)
  }

  data.table(
    COL = colnames(dt),
    NB_NA = sapply(dt, FUN = count_na),
    PROP_NA = sapply(dt, FUN = function(column) as_percent(count_na(column))),
    NB_UNIQUE = sapply(dt, FUN = count_unique),
    PROP_UNIQUE = sapply(dt, FUN = function(column) as_percent(count_unique(column))),
    keep.rownames = TRUE
  )
}

# Names of the tables of `dts` (a named list of tables) that contain at least
# one NA value. Returns character(0) when no table has any.
search_nas <- function(dts) {
  table_names <- names(dts)
  has_na <- vapply(
    table_names,
    function(nm) sum(is.na(dts[[nm]])) != 0,
    logical(1)
  )
  c(character(), table_names[has_na])
}


# Duplicate detection, in two modes.
#
# One argument - search_dup(dts):
#   `dts` is a named list of tables; returns the names of the tables that
#   contain duplicated rows (character(0) when there are none).
#
# Two arguments - search_dup(dts, dt2, remove):
#   `dts` and `dt2` are two tables with the same columns. They are stacked
#   with rbind(), the global summary of the stacked table is printed, then:
#     remove = TRUE   returns the stacked table without its duplicated rows
#     remove = FALSE  returns only the duplicated rows
#
# The default of `remove` is spelled FALSE rather than F: F is an ordinary
# variable that can be reassigned.
search_dup <- function(dts, dt2 = NULL, remove = FALSE) {
  if (is.null(dt2)) {
    table_names <- names(dts)
    has_dup <- vapply(
      table_names,
      function(nm) any(duplicated(dts[[nm]])),
      logical(1)
    )
    return(c(character(), table_names[has_dup]))
  }

  stacked <- rbind(dts, dt2)
  print(ginfo(stacked))
  is_dup <- duplicated(stacked)
  if (remove) {
    stacked[!is_dup]
  } else {
    stacked[is_dup]
  }
}

# Replace accented characters by their plain ASCII letter in every character
# column of a table.
#
# Arguments:
#   dt  table whose columns are scanned; only columns whose class is exactly
#       "character" are modified.
#
# Returns the table with the substitutions applied.
#
# NOTE(review): the patterns are raw single bytes, which correspond to accented
# letters in Latin-1 (ISO-8859-1). This presumably means the data was read as
# Latin-1 - confirm; how gsub() treats such byte patterns depends on the
# session locale.
solve_encoding <- function(dt)
{
for(cl in colnames(dt))
if(class(dt[[cl]]) == "character")
{
# e: bytes 0xE8, 0xE9, 0xEA (e-grave, e-acute, e-circumflex in Latin-1) -> "e"
dt[[cl]] <- gsub("(\xe8)|(\xe9)|(\xea)","e",dt[[cl]])
# o: first a \U escape, then byte 0xF4 (o-circumflex in Latin-1) -> "o"
# NOTE(review): \U3e34663c is above the Unicode maximum (U+10FFFF); presumably
# it is how a mis-decoded o-circumflex was displayed on the author's machine -
# confirm, and check that current R versions still accept this escape.
dt[[cl]] <- gsub("(\U3e34663c)", "o", dt[[cl]])
dt[[cl]] <- gsub("(\xf4)", "o", dt[[cl]])
# a: byte 0xE2 (a-circumflex in Latin-1) -> "a"
dt[[cl]] <- gsub("\xe2","a", dt[[cl]])
# i: byte 0xEE (i-circumflex in Latin-1) -> "i"
dt[[cl]] <- gsub("\xee", "i", dt[[cl]])
# c: byte 0xE7 (c-cedilla in Latin-1) -> "c"
dt[[cl]] <- gsub("\xe7", "c", dt[[cl]])
}
dt
}

# Names of the tables of `dts` that contain ALL the columns listed in
# `col_names`.
#
# Arguments:
#   col_names  character vector of column names to look for.
#   dts        named list of tables (defaults to the global `aida_agg`).
#
# Returns a character vector of table names (character(0) when none matches).
check_col <- function(col_names, dts = aida_agg) {
  table_names <- names(dts)
  has_all_cols <- vapply(
    table_names,
    function(nm) all(col_names %in% names(dts[[nm]])),
    logical(1)
  )
  c(character(), table_names[has_all_cols])
}
63 changes: 63 additions & 0 deletions R/tools/sql_tools.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
### sql_tools.R
### tool functions for writing R data tables to an SQL database/file and vice versa.

# Load every table of a MySQL database into a named list of data.tables.
# MySQL (server and `mysql` command-line client) must be installed.
#
# Arguments:
#   dbname      name of the database to read.
#   user, password, host  connection settings.
#   rdata_path  if not NULL, the list is also saved to this file with save(),
#               under the object name "dt_list".
#   sql_file    if not NULL, the database is first created if needed and this
#               SQL dump is imported into it with the `mysql` client.
#
# Returns the named list of data.tables (one element per table; tables whose
# name starts with "#" are skipped).
#
# NOTE(review): the password is passed to `mysql` on its command line, as in
# the original code; other local users can see it in the process list.
sql_to_r <- function(dbname, user, password, host = 'localhost', rdata_path = NULL, sql_file = NULL) {
  library(RMySQL)

  # connect to the server; the connection is released even if a step fails
  mydb <- dbConnect(MySQL(), user = user, password = password, host = host)
  on.exit(dbDisconnect(mydb), add = TRUE)

  if (!is.null(sql_file)) {
    dbClearResult(dbSendQuery(mydb, paste0("CREATE DATABASE IF NOT EXISTS ", dbname)))

    # Every argument is shell-quoted; the path is expanded first because a
    # quoted "~" is no longer expanded by the shell. The import targets the
    # same host as the connection.
    import_cmd <- paste(
      "mysql",
      "-h", shQuote(host),
      "-u", shQuote(user),
      shQuote(paste0("-p", password)),
      shQuote(dbname),
      "<", shQuote(path.expand(sql_file))
    )
    status <- system(import_cmd)
    if (status != 0) {
      stop("import of '", sql_file, "' failed (mysql exit status ", status, ")",
           call. = FALSE)
    }
  }

  dbClearResult(dbSendQuery(mydb, paste0("USE ", dbname)))
  tables_name <- dbListTables(mydb)
  tables_name <- tables_name[substr(tables_name, 1, 1) != "#"]

  # fill the data table list
  dt_list <- list()
  for (nm in tables_name) {
    dt_list[[nm]] <- data.table(dbReadTable(mydb, nm))
  }

  # save the list of data tables to RData format
  if (!is.null(rdata_path)) {
    save(list = "dt_list", file = rdata_path)
  }

  dt_list
}


# Write a named list of tables to a MySQL database, replacing its content.
#
# The database `dbname` is dropped if it exists, re-created, and one table is
# written per element of `dt_list` (named after the element).
#
# Arguments:
#   dt_list       named list of tables to write.
#   dbname        name of the database to (re)create.
#   user, password, host  connection settings.
#   sqlfile_path  if not NULL, the database is also dumped to this file with
#                 the `mysqldump` client.
#
# Returns TRUE invisibly.
#
# NOTE(review): the password is passed to `mysqldump` on its command line, as
# in the original code; other local users can see it in the process list.
r_to_sql <- function(dt_list, dbname, user, password, host = 'localhost', sqlfile_path = NULL) {
  library(RMySQL)

  # connect to the server; the connection is released even if a step fails
  mydb <- dbConnect(MySQL(), user = user, password = password, host = host)
  on.exit(dbDisconnect(mydb), add = TRUE)

  # IF EXISTS: the very first export must not fail because there is nothing
  # to drop yet.
  dbClearResult(dbSendQuery(mydb, paste0("DROP DATABASE IF EXISTS ", dbname)))
  dbClearResult(dbSendQuery(mydb, paste0("CREATE DATABASE ", dbname)))
  dbClearResult(dbSendQuery(mydb, paste0("USE ", dbname)))

  for (nm in names(dt_list)) {
    dbWriteTable(mydb, name = nm, value = dt_list[[nm]], overwrite = TRUE)
  }

  # write a new sql file
  if (!is.null(sqlfile_path)) {
    # Every argument is shell-quoted; the path is expanded first because a
    # quoted "~" is no longer expanded by the shell.
    dump_cmd <- paste(
      "mysqldump",
      "-h", shQuote(host),
      "-u", shQuote(user),
      shQuote(paste0("-p", password)),
      shQuote(dbname),
      ">", shQuote(path.expand(sqlfile_path))
    )
    status <- system(dump_cmd)
    if (status != 0) {
      stop("dump to '", sqlfile_path, "' failed (mysqldump exit status ",
           status, ")", call. = FALSE)
    }
  }

  invisible(TRUE)
}
28 changes: 28 additions & 0 deletions R/workflow_clean.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# Cleaning workflow for the aggregated AIDA database:
#   1. import the SQL dump into MySQL and load it as a list of data.tables,
#   2. run the cleaning script on that list (`aida_agg`),
#   3. write the cleaned tables back to MySQL and dump them to a new SQL file,
#   4. zip the dump with a password and save the list in RData format.
#
# The workspace is deliberately NOT wiped with rm(list = ls()): everything the
# workflow needs is (re)defined below, and a sourced script should not destroy
# the caller's objects.
library(data.table)
source("~/Desktop/croixrouge/R/tools/exp_tools.R")
source("~/Desktop/croixrouge/R/tools/sql_tools.R")

# Secrets can be supplied through environment variables; the historical values
# remain the fallback so existing setups keep working.
# NOTE(review): the fallbacks are still secrets stored in source control.
db_password <- Sys.getenv("AIDA_DB_PASSWORD", unset = "1234")
zip_password <- Sys.getenv("AIDA_ZIP_PASSWORD", unset = "vincalmeouhumidechien")


#get the sql file
print('sql to R:')
aida_agg <- sql_to_r(dbname = 'aida_agg', sql_file = '/home/selim/Desktop/tests/2015-04-03-db-aida_pri.sql',
                     user = 'root', password = db_password, host = 'localhost', rdata_path = NULL)


#cleaning script
print('clean the script:')
source('/home/selim/Desktop/croixrouge/R/data_cleaning.R')


#bring back the r data to sql format
print('r to sql:')
r_to_sql(dt_list = aida_agg, dbname = 'aida_agg',
         user = 'root', password = db_password, host = 'localhost', sqlfile_path = '~/Desktop/tests/2015-06-02-db-aida_pri.sql')


#zip the file
# "&&" instead of ";": zip must not run in the wrong directory if cd fails.
system(paste(
  'cd ~/Desktop/tests &&',
  'zip --password', shQuote(zip_password),
  '2015-06-02-db-aida_pri.sql.zip 2015-06-02-db-aida_pri.sql'
))

#save the new database to R format
save(list = 'aida_agg', file = '/home/selim/Desktop/croixrouge/R/RData/aida_agg.RData')
Binary file not shown.
Loading

0 comments on commit b9f5a4a

Please sign in to comment.