Skip to content

Commit

Permalink
Added description to data wrangling scripts. Now only do monthly data.
Browse files Browse the repository at this point in the history
  • Loading branch information
lbrandt committed Feb 18, 2018
1 parent 32b8982 commit 5fd5d36
Show file tree
Hide file tree
Showing 3 changed files with 158 additions and 189 deletions.
48 changes: 6 additions & 42 deletions MATLAB/ez_import_data.m
Original file line number Diff line number Diff line change
@@ -1,15 +1,14 @@
% -------------------------------------------------------------------------
% Import Eurozone data
%
%
%
% Imports Eurozone data that was previously cleaned and transformed in R
% into MATLAB and saves balanced panel of macroeconomic series to .mat
% workspace (version 7.3).
%
% Lennart Brandt, Feb 2018
% -------------------------------------------------------------------------

%clear; clc;
%addpath('..\R;..\MATLAB')
%addpath('..\R;..\MATLAB;')

% EZ data monthly

%h5disp('ez_data.h5')
names = h5read('ez_data.h5', '/names');
Expand Down Expand Up @@ -41,39 +40,4 @@
dates = dates(2:end);
x = x(2:end, :);

save ez_data names dates x




% EZ data QUARTERLY
%h5disp('ez_data90_q.h5')
names = h5read('ez_data90_q.h5', '/varnames');
dates = datetime(h5read('ez_data90_q.h5', '/dates'), 'InputFormat', 'yyyyQQQ', 'Format', 'yyyyQQQ');
x = h5read('ez_data90_q.h5', '/dlndata')';

[T, N] = size(x);

fprintf('Quarterly series from %s to %s \n', datestr(dates(1)), datestr(dates(end)));
fprintf('Sample: T = %d, N = %d \n', T, N);

% Format names such that they are permissible MATLAB varnames
names = strrep(names, ' ', '_');
names = strrep(names, '&', '_');
names = strrep(names, '/', '_');
names = strrep(names, '\', '_');
names = strrep(names, ':', '');
names = strrep(names, '.', '');
names = strrep(names, ',', '');
names = strrep(names, '+', '');
names = strrep(names, '-', '');
names = strrep(names, '<', '');
names = strrep(names, '>', '');
names = strrep(names, '(', '');
names = strrep(names, ')', '');
names = strrep(names, '', '');

% Shorten dates of diffed series
dates = dates(2:end);

save ez_data90_q names dates x
save ez_data -v7.3 names dates x
161 changes: 14 additions & 147 deletions R/ez_data.R
Original file line number Diff line number Diff line change
@@ -1,5 +1,16 @@
# EZ data

# -------------------------------------------------------------------------
# Loads time series on Eurozone economy from Datastream request file and
# transforms them according to accompanying meta data file. Saves results
# to HDF5 container for further use.
#
# Lennart Brandt, Feb 2018
# -------------------------------------------------------------------------

# Initialize CRAN packages

#install.packages("tidyverse")
#install.packages("readxl")
#install.packages("h5")
require(tidyverse)
require(readxl)
require(h5)
Expand All @@ -18,7 +29,7 @@ setwd(location.thisfile)
# Set location of data relative to working directory which contains this script
location.data = normalizePath(file.path("..", "..", "..", "Data"), winslash = "/")


View(ez_data$EMGBOND.)


# EZ data monthly --------------------------------------------------------
Expand All @@ -29,8 +40,6 @@ datastream_ez = read_excel(file.path(location.data, "DATASTREAM_REQUEST_MONTHLY.
# Extract dates
source("dateShift.R")
dates = dateShift(datastream_ez[[1]], unit = 'month', rule = 'start')
#ta = first(dates)
#te = last(dates)


# Build dataset
Expand All @@ -50,8 +59,6 @@ for(i in 1:dim(ez_data)[2]){
attr(ez_data, "transformation")[i] = ez_meta$transform[which(ez_meta$code == names(ez_data)[i])]
}

attributes(ez_data)$transformation

ez_tdata = ez_data %>%
mutate_if(.predicate = attributes(ez_data)$transformation == 1, funs(. - lag(.))) %>% # Diffs
mutate_if(.predicate = attributes(ez_data)$transformation == 2, funs(log(.) - lag(log(.)))) # Logdiffs
Expand Down Expand Up @@ -84,143 +91,3 @@ out.file["/dates"] = out.dates
out.file["/data"] = out.data
out.file["/tdata"] = out.tdata
h5close(out.file)






# VAR data monthly --------------------------------------------------------
datastream_var = read_excel(file.path(location.data, "DATASTREAM_REQUEST_MONTHLY.xls"),
sheet = "var", col_names = TRUE, na = "NA", skip = 1, guess_max = 6000)


# Extract dates
source("dateShift.R")
dates = dateShift(datastream_var[[1]], unit = 'month', rule = 'start')
#ta = first(dates)
#te = last(dates)


# Build dataset
ez_vardata = datastream_var %>%

rename(EMEBCPGS = "EMEBCPGS%") %>%
rename(EMTOTUNQ = "EMTOTUN%Q") %>%
rename(EKUNTOTQ = "EKUN%TOTQ") %>%

select(-starts_with("X__")) %>% # Remove empty columns
select(-starts_with("Code")) %>% # Remove date columns
select(-c(EKIMPPRCF)) %>% # Remove short series

mutate_at(vars(-c(EMPRATE., EMIBOR3., EMIBOR1Y, EMGBOND., OIEURSW, OIEUR2W, OIEUR1M, OIEUR3M, OIEUR10, OIEUR1Y, USEURSP, # Interest rates
EKEBUN..O, EMUNPTOTO, EKEBUN..O, EMTOTUNQ, EKUNTOTQ, # Unemployment rates
EMEBCPGS, EKCAFBALA)), # CPI rate & Balance
funs(log(.))) %>% # Log

#mutate_at(vars(c(EMM1....B, EMM2....B, EMM3....B, EMASTOT, EMECASM, EMECLBC, EMECLEM,
# EKEBCARRO, EMACECARP, EMECBALEA, EMSHRPRCF, EMCRDCONA)), funs(log(.))) %>% # Log

mutate_at(vars(EMECASM), funs(replace(., is.na(.), 0))) # Set NA values in EMECASM to zero


# Inquire position of the first and last row which contains no empty cells in array
ta.index = first(which( rowSums(is.na(ez_vardata)) == 0 ))
te.index = last(which( rowSums(is.na(ez_vardata)) == 0 ))

# Construct time range of largest complete dataset
ta = dates[ta.index]
te = dates[te.index]
sample = ta.index:te.index

# Save data to workspace
save(dates, ez_vardata, file = "ez_vardata.RData")

# Prepare data for export
out.varnames = colnames(ez_vardata)
out.dates = as.character.Date(dates[sample])
out.data = as.matrix(ez_vardata[sample, ])

# Save results to HDF5
out.file = h5file("ez_vardata.h5", mode = "w")
out.file["/varnames"] = out.varnames
out.file["/dates"] = out.dates
out.file["/data"] = out.data
h5close(out.file)




# AWM data quarterly ------------------------------------------------------
awmdata_ez = read_csv(file.path(location.data, "ez_awmdata.csv"))

# Read raw data from datastream request file
#datastream_de = read_excel(file.path(location.data, "DATASTREAM_REQUEST_MONTHLY.xls"),
# sheet = "de", col_names = TRUE, na = "NA", skip = 1)


# Extract dates
dates = awmdata_ez[[1]]
ta = first(dates)
te = last(dates)

# Inquire position of the first and last row which contains no empty cells in array
ta.index = first(which( rowSums(is.na(awmdata_ez)) == 0 ))
te.index = last(which( rowSums(is.na(awmdata_ez)) == 0 ))

# Preliminary cleanup
ez_data = awmdata_ez %>%

select(-starts_with("X1")) # Remove data column

# Restrict sample to 1990Q1
index90 = which(dates == "1990Q1")
ez_data90 = ez_data[index90:te.index, ]
dates = dates[index90:te.index]

# Delete short series
indexNA = which( colSums(is.na(ez_data90[])) != 0 )
nameNA = colnames(ez_data90)[indexNA]
ez_data90 = select(ez_data90, -one_of(nameNA))

# Reset ta te
ta.index = first(which( rowSums(is.na(ez_data90)) == 0 ))
te.index = last(which( rowSums(is.na(ez_data90)) == 0 ))

ta = dates[ta.index]
te = dates[te.index]


# Do not log series which contain negative numbers
indexNEG = which( colSums((ez_data90<=0)) != 0)
nameNEG = colnames(ez_data90)[indexNEG]

# Apply transformations for stationarity
dlndata = ez_data90 %>%

mutate_at(vars(-one_of(nameNEG)),
funs(log)) %>%

mutate_at(vars(), # Ifo Inventories
funs(. - lag(.))) %>%

slice(2:te.index) # delete first row b/c diff

# Save data to workspace
save(dates, ez_data90, dlndata, file = "ez_data90_q.RData")


# Prepare data for export
out.varnames = colnames(dlndata)
out.dates = as.character.Date(dates[ta.index:te.index])
out.data = as.matrix(ez_data90[ta.index:te.index, ]) # level data
out.dlndata = as.matrix(dlndata)[,] # diffed series are one observation shorter


# Save results to HDF5
out.file = h5file("ez_data90_q.h5", mode = "w")
out.file["/varnames"] = out.varnames
out.file["/dates"] = out.dates
out.file["/data"] = out.data
out.file["/dlndata"] = out.dlndata
h5close(out.file)
138 changes: 138 additions & 0 deletions R/old_ez_data.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
# EZ data old



# VAR data monthly --------------------------------------------------------
datastream_var = read_excel(file.path(location.data, "DATASTREAM_REQUEST_MONTHLY.xls"),
sheet = "var", col_names = TRUE, na = "NA", skip = 1, guess_max = 6000)


# Extract dates
source("dateShift.R")
dates = dateShift(datastream_var[[1]], unit = 'month', rule = 'start')
#ta = first(dates)
#te = last(dates)


# Build dataset
ez_vardata = datastream_var %>%

rename(EMEBCPGS = "EMEBCPGS%") %>%
rename(EMTOTUNQ = "EMTOTUN%Q") %>%
rename(EKUNTOTQ = "EKUN%TOTQ") %>%

select(-starts_with("X__")) %>% # Remove empty columns
select(-starts_with("Code")) %>% # Remove date columns
select(-c(EKIMPPRCF)) %>% # Remove short series

mutate_at(vars(-c(EMPRATE., EMIBOR3., EMIBOR1Y, EMGBOND., OIEURSW, OIEUR2W, OIEUR1M, OIEUR3M, OIEUR10, OIEUR1Y, USEURSP, # Interest rates
EKEBUN..O, EMUNPTOTO, EKEBUN..O, EMTOTUNQ, EKUNTOTQ, # Unemployment rates
EMEBCPGS, EKCAFBALA)), # CPI rate & Balance
funs(log(.))) %>% # Log

#mutate_at(vars(c(EMM1....B, EMM2....B, EMM3....B, EMASTOT, EMECASM, EMECLBC, EMECLEM,
# EKEBCARRO, EMACECARP, EMECBALEA, EMSHRPRCF, EMCRDCONA)), funs(log(.))) %>% # Log

mutate_at(vars(EMECASM), funs(replace(., is.na(.), 0))) # Set NA values in EMECASM to zero


# Inquire position of the first and last row which contains no empty cells in array
ta.index = first(which( rowSums(is.na(ez_vardata)) == 0 ))
te.index = last(which( rowSums(is.na(ez_vardata)) == 0 ))

# Construct time range of largest complete dataset
ta = dates[ta.index]
te = dates[te.index]
sample = ta.index:te.index

# Save data to workspace
save(dates, ez_vardata, file = "ez_vardata.RData")

# Prepare data for export
out.varnames = colnames(ez_vardata)
out.dates = as.character.Date(dates[sample])
out.data = as.matrix(ez_vardata[sample, ])

# Save results to HDF5
out.file = h5file("ez_vardata.h5", mode = "w")
out.file["/varnames"] = out.varnames
out.file["/dates"] = out.dates
out.file["/data"] = out.data
h5close(out.file)




# AWM data quarterly ------------------------------------------------------
awmdata_ez = read_csv(file.path(location.data, "ez_awmdata.csv"))

# Read raw data from datastream request file
#datastream_de = read_excel(file.path(location.data, "DATASTREAM_REQUEST_MONTHLY.xls"),
# sheet = "de", col_names = TRUE, na = "NA", skip = 1)


# Extract dates
dates = awmdata_ez[[1]]
ta = first(dates)
te = last(dates)

# Inquire position of the first and last row which contains no empty cells in array
ta.index = first(which( rowSums(is.na(awmdata_ez)) == 0 ))
te.index = last(which( rowSums(is.na(awmdata_ez)) == 0 ))

# Preliminary cleanup
ez_data = awmdata_ez %>%

select(-starts_with("X1")) # Remove data column

# Restrict sample to 1990Q1
index90 = which(dates == "1990Q1")
ez_data90 = ez_data[index90:te.index, ]
dates = dates[index90:te.index]

# Delete short series
indexNA = which( colSums(is.na(ez_data90[])) != 0 )
nameNA = colnames(ez_data90)[indexNA]
ez_data90 = select(ez_data90, -one_of(nameNA))

# Reset ta te
ta.index = first(which( rowSums(is.na(ez_data90)) == 0 ))
te.index = last(which( rowSums(is.na(ez_data90)) == 0 ))

ta = dates[ta.index]
te = dates[te.index]


# Do not log series which contain negative numbers
indexNEG = which( colSums((ez_data90<=0)) != 0)
nameNEG = colnames(ez_data90)[indexNEG]

# Apply transformations for stationarity
dlndata = ez_data90 %>%

mutate_at(vars(-one_of(nameNEG)),
funs(log)) %>%

mutate_at(vars(), # Ifo Inventories
funs(. - lag(.))) %>%

slice(2:te.index) # delete first row b/c diff

# Save data to workspace
save(dates, ez_data90, dlndata, file = "ez_data90_q.RData")


# Prepare data for export
out.varnames = colnames(dlndata)
out.dates = as.character.Date(dates[ta.index:te.index])
out.data = as.matrix(ez_data90[ta.index:te.index, ]) # level data
out.dlndata = as.matrix(dlndata)[,] # diffed series are one observation shorter


# Save results to HDF5
out.file = h5file("ez_data90_q.h5", mode = "w")
out.file["/varnames"] = out.varnames
out.file["/dates"] = out.dates
out.file["/data"] = out.data
out.file["/dlndata"] = out.dlndata
h5close(out.file)

0 comments on commit 5fd5d36

Please sign in to comment.