Added description to data wrangling scripts. Now only do monthly data.

lbrandt · Feb 18, 2018 · 5fd5d36 · 5fd5d36
1 parent 32b8982
commit 5fd5d36
Show file tree

Hide file tree

Showing 3 changed files with 158 additions and 189 deletions.
diff --git a/MATLAB/ez_import_data.m b/MATLAB/ez_import_data.m
@@ -1,15 +1,14 @@
 % -------------------------------------------------------------------------
-% Import Eurozone data
-%
-%
-%
+% Imports Eurozone data that was previously cleaned and transformed in R
+% into MATLAB and saves balanced panel of macroeconomic series to .mat
+% workspace (version 7.3).
 %
+% Lennart Brandt, Feb 2018
 % -------------------------------------------------------------------------
 
 %clear; clc;
-%addpath('..\R;..\MATLAB')
+%addpath('..\R;..\MATLAB;')
 
-% EZ data monthly
 
 %h5disp('ez_data.h5')
 names = h5read('ez_data.h5', '/names');
@@ -41,39 +40,4 @@
 dates = dates(2:end);
 x = x(2:end, :);
 
-save ez_data names dates x
-
-
-
-
-% EZ data QUARTERLY
-%h5disp('ez_data90_q.h5')
-names = h5read('ez_data90_q.h5', '/varnames');
-dates    = datetime(h5read('ez_data90_q.h5', '/dates'), 'InputFormat', 'yyyyQQQ', 'Format', 'yyyyQQQ');
-x        = h5read('ez_data90_q.h5', '/dlndata')';
-
-[T, N]   = size(x);
-
-fprintf('Quarterly series from %s to %s \n', datestr(dates(1)), datestr(dates(end)));
-fprintf('Sample: T = %d, N = %d \n', T, N);
-
-% Format names such that they are permissible MATLAB varnames
-names = strrep(names, ' ', '_');
-names = strrep(names, '&', '_');
-names = strrep(names, '/', '_');
-names = strrep(names, '\', '_');
-names = strrep(names, ':', '');
-names = strrep(names, '.', '');
-names = strrep(names, ',', '');
-names = strrep(names, '+', '');
-names = strrep(names, '-', '');
-names = strrep(names, '<', '');
-names = strrep(names, '>', '');
-names = strrep(names, '(', '');
-names = strrep(names, ')', '');
-names = strrep(names, '’', '');
-
-% Shorten dates of diffed series
-dates = dates(2:end);
-
-save ez_data90_q names dates x
+save ez_data -v7.3 names dates x
diff --git a/R/ez_data.R b/R/ez_data.R
@@ -1,5 +1,16 @@
-# EZ data
-
+# -------------------------------------------------------------------------
+# Loads time series on Eurozone economy from Datastream request file and
+# transforms them according to accompanying meta data file. Saves results
+# to HDF5 container for further use.
+#
+# Lennart Brandt, Feb 2018
+# -------------------------------------------------------------------------
+
+# Initialize CRAN packages
+
+#install.packages("tidyverse")
+#install.packages("readxl")
+#install.packages("h5")
 require(tidyverse)
 require(readxl)
 require(h5)
@@ -18,7 +29,7 @@ setwd(location.thisfile)
 # Set location of data relative to working directory which contains this script
 location.data = normalizePath(file.path("..", "..", "..", "Data"), winslash = "/")
 
-
+View(ez_data$EMGBOND.)
 
 
 # EZ data monthly --------------------------------------------------------
@@ -29,8 +40,6 @@ datastream_ez = read_excel(file.path(location.data, "DATASTREAM_REQUEST_MONTHLY.
 # Extract dates
 source("dateShift.R")
 dates = dateShift(datastream_ez[[1]], unit = 'month', rule = 'start')
-#ta = first(dates)
-#te = last(dates)
 
 
 # Build dataset
@@ -50,8 +59,6 @@ for(i in 1:dim(ez_data)[2]){
   attr(ez_data, "transformation")[i] = ez_meta$transform[which(ez_meta$code == names(ez_data)[i])]
 }
 
-attributes(ez_data)$transformation
-
 ez_tdata = ez_data %>%
   mutate_if(.predicate = attributes(ez_data)$transformation == 1, funs(. - lag(.))) %>% # Diffs
   mutate_if(.predicate = attributes(ez_data)$transformation == 2, funs(log(.) - lag(log(.)))) # Logdiffs
@@ -84,143 +91,3 @@ out.file["/dates"] = out.dates
 out.file["/data"]  = out.data
 out.file["/tdata"] = out.tdata
 h5close(out.file)
-
-
-
-
-
-
-# VAR data monthly --------------------------------------------------------
-datastream_var = read_excel(file.path(location.data, "DATASTREAM_REQUEST_MONTHLY.xls"), 
-                            sheet = "var", col_names = TRUE, na = "NA", skip = 1, guess_max = 6000)
-
-
-# Extract dates
-source("dateShift.R")
-dates = dateShift(datastream_var[[1]], unit = 'month', rule = 'start')
-#ta = first(dates)
-#te = last(dates)
-
-
-# Build dataset
-ez_vardata = datastream_var %>%
-
-  rename(EMEBCPGS = "EMEBCPGS%") %>%
-  rename(EMTOTUNQ = "EMTOTUN%Q") %>%
-  rename(EKUNTOTQ = "EKUN%TOTQ") %>%
-
-  select(-starts_with("X__")) %>% # Remove empty columns
-  select(-starts_with("Code")) %>% # Remove date columns
-  select(-c(EKIMPPRCF)) %>% # Remove short series
-
-  mutate_at(vars(-c(EMPRATE., EMIBOR3., EMIBOR1Y, EMGBOND., OIEURSW, OIEUR2W, OIEUR1M, OIEUR3M, OIEUR10, OIEUR1Y, USEURSP, # Interest rates
-                    EKEBUN..O, EMUNPTOTO, EKEBUN..O, EMTOTUNQ, EKUNTOTQ, # Unemployment rates
-                    EMEBCPGS, EKCAFBALA)), # CPI rate & Balance 
-            funs(log(.))) %>% # Log
-
-  #mutate_at(vars(c(EMM1....B, EMM2....B, EMM3....B, EMASTOT, EMECASM, EMECLBC, EMECLEM,
-  #                 EKEBCARRO, EMACECARP, EMECBALEA, EMSHRPRCF, EMCRDCONA)), funs(log(.))) %>% # Log
-
-  mutate_at(vars(EMECASM), funs(replace(., is.na(.), 0))) # Set NA values in EMECASM to zero
-
-
-# Inquire position of the first and last row which contains no empty cells in array
-ta.index = first(which( rowSums(is.na(ez_vardata)) == 0 ))
-te.index = last(which( rowSums(is.na(ez_vardata)) == 0 ))
-
-# Construct time range of largest complete dataset
-ta = dates[ta.index]
-te = dates[te.index]
-sample = ta.index:te.index
-
-# Save data to workspace
-save(dates, ez_vardata, file = "ez_vardata.RData")
-
-# Prepare data for export
-out.varnames  = colnames(ez_vardata)
-out.dates     = as.character.Date(dates[sample])
-out.data      = as.matrix(ez_vardata[sample, ])
-
-# Save results to HDF5
-out.file = h5file("ez_vardata.h5", mode = "w")
-out.file["/varnames"] = out.varnames
-out.file["/dates"] = out.dates
-out.file["/data"] = out.data
-h5close(out.file)
-
-
-
-
-# AWM data quarterly ------------------------------------------------------
-awmdata_ez = read_csv(file.path(location.data, "ez_awmdata.csv"))
-
-# Read raw data from datastream request file
-#datastream_de = read_excel(file.path(location.data, "DATASTREAM_REQUEST_MONTHLY.xls"), 
-#                           sheet = "de", col_names = TRUE, na = "NA", skip = 1)
-
-
-# Extract dates
-dates = awmdata_ez[[1]]
-ta = first(dates)
-te = last(dates)
-
-# Inquire position of the first and last row which contains no empty cells in array
-ta.index = first(which( rowSums(is.na(awmdata_ez)) == 0 ))
-te.index = last(which( rowSums(is.na(awmdata_ez)) == 0 ))
-
-# Preliminary cleanup
-ez_data = awmdata_ez %>%
-
-  select(-starts_with("X1")) # Remove data column
-
-# Restrict sample to 1990Q1
-index90 = which(dates == "1990Q1")
-ez_data90 = ez_data[index90:te.index, ]
-dates = dates[index90:te.index]
-
-# Delete short series
-indexNA = which( colSums(is.na(ez_data90[])) != 0 )
-nameNA = colnames(ez_data90)[indexNA]
-ez_data90 = select(ez_data90, -one_of(nameNA))
-
-# Reset ta te
-ta.index = first(which( rowSums(is.na(ez_data90)) == 0 ))
-te.index = last(which( rowSums(is.na(ez_data90)) == 0 ))
-
-ta = dates[ta.index]
-te = dates[te.index]
-
-
-# Do not log series which contain negative numbers
-indexNEG = which( colSums((ez_data90<=0)) != 0)
-nameNEG = colnames(ez_data90)[indexNEG]
-
-# Apply transformations for stationarity
-dlndata = ez_data90 %>%
-
-  mutate_at(vars(-one_of(nameNEG)),
-            funs(log)) %>%
-
-  mutate_at(vars(), # Ifo Inventories
-            funs(. - lag(.))) %>%
-
-  slice(2:te.index) # delete first row b/c diff
-
-# Save data to workspace
-save(dates, ez_data90, dlndata, file = "ez_data90_q.RData")
-
-
-# Prepare data for export
-out.varnames  = colnames(dlndata)
-out.dates     = as.character.Date(dates[ta.index:te.index])
-out.data      = as.matrix(ez_data90[ta.index:te.index, ]) # level data
-out.dlndata   = as.matrix(dlndata)[,] # diffed series are one observation shorter
-
-
-# Save results to HDF5
-out.file = h5file("ez_data90_q.h5", mode = "w")
-out.file["/varnames"] = out.varnames
-out.file["/dates"] = out.dates
-out.file["/data"] = out.data
-out.file["/dlndata"] = out.dlndata
-h5close(out.file)
diff --git a/R/old_ez_data.R b/R/old_ez_data.R
@@ -0,0 +1,138 @@
+# EZ data old
+
+
+
+# VAR data monthly --------------------------------------------------------
+datastream_var = read_excel(file.path(location.data, "DATASTREAM_REQUEST_MONTHLY.xls"), 
+                            sheet = "var", col_names = TRUE, na = "NA", skip = 1, guess_max = 6000)
+
+
+# Extract dates
+source("dateShift.R")
+dates = dateShift(datastream_var[[1]], unit = 'month', rule = 'start')
+#ta = first(dates)
+#te = last(dates)
+
+
+# Build dataset
+ez_vardata = datastream_var %>%
+
+  rename(EMEBCPGS = "EMEBCPGS%") %>%
+  rename(EMTOTUNQ = "EMTOTUN%Q") %>%
+  rename(EKUNTOTQ = "EKUN%TOTQ") %>%
+
+  select(-starts_with("X__")) %>% # Remove empty columns
+  select(-starts_with("Code")) %>% # Remove date columns
+  select(-c(EKIMPPRCF)) %>% # Remove short series
+
+  mutate_at(vars(-c(EMPRATE., EMIBOR3., EMIBOR1Y, EMGBOND., OIEURSW, OIEUR2W, OIEUR1M, OIEUR3M, OIEUR10, OIEUR1Y, USEURSP, # Interest rates
+                    EKEBUN..O, EMUNPTOTO, EKEBUN..O, EMTOTUNQ, EKUNTOTQ, # Unemployment rates
+                    EMEBCPGS, EKCAFBALA)), # CPI rate & Balance 
+            funs(log(.))) %>% # Log
+
+  #mutate_at(vars(c(EMM1....B, EMM2....B, EMM3....B, EMASTOT, EMECASM, EMECLBC, EMECLEM,
+  #                 EKEBCARRO, EMACECARP, EMECBALEA, EMSHRPRCF, EMCRDCONA)), funs(log(.))) %>% # Log
+
+  mutate_at(vars(EMECASM), funs(replace(., is.na(.), 0))) # Set NA values in EMECASM to zero
+
+
+# Inquire position of the first and last row which contains no empty cells in array
+ta.index = first(which( rowSums(is.na(ez_vardata)) == 0 ))
+te.index = last(which( rowSums(is.na(ez_vardata)) == 0 ))
+
+# Construct time range of largest complete dataset
+ta = dates[ta.index]
+te = dates[te.index]
+sample = ta.index:te.index
+
+# Save data to workspace
+save(dates, ez_vardata, file = "ez_vardata.RData")
+
+# Prepare data for export
+out.varnames  = colnames(ez_vardata)
+out.dates     = as.character.Date(dates[sample])
+out.data      = as.matrix(ez_vardata[sample, ])
+
+# Save results to HDF5
+out.file = h5file("ez_vardata.h5", mode = "w")
+out.file["/varnames"] = out.varnames
+out.file["/dates"] = out.dates
+out.file["/data"] = out.data
+h5close(out.file)
+
+
+
+
+# AWM data quarterly ------------------------------------------------------
+awmdata_ez = read_csv(file.path(location.data, "ez_awmdata.csv"))
+
+# Read raw data from datastream request file
+#datastream_de = read_excel(file.path(location.data, "DATASTREAM_REQUEST_MONTHLY.xls"), 
+#                           sheet = "de", col_names = TRUE, na = "NA", skip = 1)
+
+
+# Extract dates
+dates = awmdata_ez[[1]]
+ta = first(dates)
+te = last(dates)
+
+# Inquire position of the first and last row which contains no empty cells in array
+ta.index = first(which( rowSums(is.na(awmdata_ez)) == 0 ))
+te.index = last(which( rowSums(is.na(awmdata_ez)) == 0 ))
+
+# Preliminary cleanup
+ez_data = awmdata_ez %>%
+
+  select(-starts_with("X1")) # Remove data column
+
+# Restrict sample to 1990Q1
+index90 = which(dates == "1990Q1")
+ez_data90 = ez_data[index90:te.index, ]
+dates = dates[index90:te.index]
+
+# Delete short series
+indexNA = which( colSums(is.na(ez_data90[])) != 0 )
+nameNA = colnames(ez_data90)[indexNA]
+ez_data90 = select(ez_data90, -one_of(nameNA))
+
+# Reset ta te
+ta.index = first(which( rowSums(is.na(ez_data90)) == 0 ))
+te.index = last(which( rowSums(is.na(ez_data90)) == 0 ))
+
+ta = dates[ta.index]
+te = dates[te.index]
+
+
+# Do not log series which contain negative numbers
+indexNEG = which( colSums((ez_data90<=0)) != 0)
+nameNEG = colnames(ez_data90)[indexNEG]
+
+# Apply transformations for stationarity
+dlndata = ez_data90 %>%
+
+  mutate_at(vars(-one_of(nameNEG)),
+            funs(log)) %>%
+
+  mutate_at(vars(), # Ifo Inventories
+            funs(. - lag(.))) %>%
+
+  slice(2:te.index) # delete first row b/c diff
+
+# Save data to workspace
+save(dates, ez_data90, dlndata, file = "ez_data90_q.RData")
+
+
+# Prepare data for export
+out.varnames  = colnames(dlndata)
+out.dates     = as.character.Date(dates[ta.index:te.index])
+out.data      = as.matrix(ez_data90[ta.index:te.index, ]) # level data
+out.dlndata   = as.matrix(dlndata)[,] # diffed series are one observation shorter
+
+
+# Save results to HDF5
+out.file = h5file("ez_data90_q.h5", mode = "w")
+out.file["/varnames"] = out.varnames
+out.file["/dates"] = out.dates
+out.file["/data"] = out.data
+out.file["/dlndata"] = out.dlndata
+h5close(out.file)