worldbank · sebxwolf · May 12, 2020 · Apr 30, 2020 · Apr 30, 2020 · Apr 30, 2020
diff --git a/cdr-aggregation/config_file_template_hive.py b/cdr-aggregation/config_file_template_hive.py
@@ -0,0 +1,34 @@
+# Configuration template for running the CDR aggregation against a hive table.
+# The aggregation notebook exec()s its config file, so a config built from this
+# template should import everything it references itself.
+import datetime as dt  # used by the 'dates' entries below
+
+from pyspark.sql.types import *
+
+schema = StructType([
+  # NOTE(review): Spark's IntegerType is a 32-bit signed int - confirm msisdn holds small anonymized ids, not raw phone numbers
+  StructField("msisdn", IntegerType(), True),
+  StructField("call_datetime", StringType(), True), #load as string, will be turned into datetime in standardize_csv_files()
+  StructField("location_id", StringType(), True)
+])
+
+datasource_configs = {
+  "base_path": "path_to_folder/data", #folder path used in this docker env
+  "hive_warehouse_location": "path_to_hive_warehouse",
+  "spark_mode": 'hive',
+  # Source column names in the hive table, plus the table itself under 'calls';
+  # the notebook's "load hive table" cell formats these into its SELECT statement.
+  "hive_vars":{ 'msisdn' : 'col1',
+                'call_datetime': 'col2',
+                'location_id': 'col3',
+                'calls': 'table'},
+  "country_code": "",
+  "telecom_alias": "",
+  "schema" : schema,
+  "data_paths" : ["*.csv"],
+  "filestub": "",
+  "geofiles": {},
+  "shapefiles": ['admin2','admin3', 'voronoi'],
+  "dates": {'start_date' : dt.datetime(2020,2,1),
+            'end_date' : dt.datetime(2020,3,31)}
+}
diff --git a/cdr-aggregation/notebooks/agregation_offsite.ipynb b/cdr-aggregation/notebooks/agregation_offsite.ipynb
@@ -42,8 +42,8 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "%load_ext autoreload\n",
-    "%autoreload 2"
+    "# %load_ext autoreload\n",
+    "# %autoreload 2"
    ]
   },
   {
@@ -52,7 +52,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "from modules.setup import *"
+    "from modules.DataSource import *"
    ]
   },
   {
@@ -61,21 +61,26 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "spark"
+    "config_file = '../config_file.py'"
    ]
   },
   {
-   "cell_type": "markdown",
+   "cell_type": "code",
+   "execution_count": null,
    "metadata": {},
+   "outputs": [],
    "source": [
-    "# Import data"
+    "exec(open(config_file).read())"
    ]
   },
   {
-   "cell_type": "markdown",
+   "cell_type": "code",
+   "execution_count": null,
    "metadata": {},
+   "outputs": [],
    "source": [
-    "## Set up the configuration for data standardization"
+    "ds = DataSource(datasource_configs)\n",
+    "ds.show_config()"
    ]
   },
   {
@@ -84,33 +89,28 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "config_file = '../config_file.py'"
+    "from modules.setup import *"
    ]
   },
   {
-   "cell_type": "code",
-   "execution_count": null,
+   "cell_type": "markdown",
    "metadata": {},
-   "outputs": [],
    "source": [
-    "exec(open(config_file).read())"
+    "# Import data"
    ]
   },
   {
-   "cell_type": "code",
-   "execution_count": null,
+   "cell_type": "markdown",
    "metadata": {},
-   "outputs": [],
    "source": [
-    "ds = DataSource(datasource_configs)\n",
-    "ds.show_config()"
+    "## Load CDR data"
    ]
   },
   {
    "cell_type": "markdown",
    "metadata": {},
    "source": [
-    "## Standardize raw csv files"
+    "### Process/standardize raw data, save as parquet, and then load it"
    ]
   },
   {
@@ -132,6 +132,35 @@
     "#ds.load_standardized_parquet_file()"
    ]
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Alternatively, specify and load hive table"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
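+    "# Specify and load hive data: source columns and table are taken from ds.hive_vars and aliased to msisdn, call_datetime and location_id\n",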
+    "ds.parquet_df = ds.spark.sql(\"\"\"SELECT {} AS msisdn, \n",
+    "                                       {} AS call_datetime, \n",
+    "                                       {} AS location_id FROM {}\"\"\".format(ds.hive_vars['msisdn'],\n",
+    "                                                                           ds.hive_vars['call_datetime'],\n",
+    "                                                                           ds.hive_vars['location_id'],\n",
+    "                                                                           ds.hive_vars['calls']))"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Or load a sample file"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -140,9 +169,9 @@
    "source": [
     "## Use this in case you want to sample the data and run the code on the sample\n",
     "\n",
-    "#ds.sample_and_save(number_of_ids=1000)\n",
-    "ds.load_sample('sample_feb_mar2020')\n",
-    "ds.parquet_df = ds.sample_df"
+    "# #ds.sample_and_save(number_of_ids=1000)\n",
+    "# ds.load_sample('sample_feb_mar2020')\n",
+    "# ds.parquet_df = ds.sample_df"
    ]
   },
   {
@@ -310,7 +339,7 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "!jupyter nbconvert --to script *.ipynb"
+    "#!jupyter nbconvert --to script *.ipynb"
    ]
   }
  ],

diff --git a/cdr-aggregation/notebooks/agregation_offsite.py b/cdr-aggregation/notebooks/agregation_offsite.py
@@ -27,49 +27,42 @@
 
 # # Import code
 
-# In[ ]:
-
+# In[2]:
 
-get_ipython().run_line_magic('load_ext', 'autoreload')
-get_ipython().run_line_magic('autoreload', '2')
 
+from modules.DataSource import *
 
-# In[ ]:
 
-
-from modules.setup import *
+# In[3]:
 
 
-# In[ ]:
-
-
-spark
+config_file = '../config_file.py'
 
 
-# # Import data
+# In[4]:
 
-# ## Set up the configuration for data standardization
 
-# In[ ]:
+exec(open(config_file).read())
 
 
-config_file = '../config_file.py'
+# In[10]:
 
 
-# In[ ]:
+ds = DataSource(datasource_configs)
+ds.show_config()
 
 
-exec(open(config_file).read())
+# In[11]:
 
 
-# In[ ]:
+from modules.setup import *
 
 
-ds = DataSource(datasource_configs)
-ds.show_config()
+# # Import data
 
+# ## Load CDR data
 
-# ## Standardize raw csv files
+# ### Process/standardize raw data, save as parquet, and then load it
 
 # In[ ]:
 
@@ -84,19 +77,35 @@
 #ds.load_standardized_parquet_file()
 
 
+# ### Alternatively, specify and load hive table
+
 # In[ ]:
 
 
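+# Specify and load hive data: source columns and table are taken from ds.hive_vars and aliased to msisdn, call_datetime and location_id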
+ds.parquet_df = ds.spark.sql("""SELECT {} AS msisdn, 
+                                       {} AS call_datetime, 
+                                       {} AS location_id FROM {}""".format(ds.hive_vars['msisdn'],
+                                                                           ds.hive_vars['call_datetime'],
+                                                                           ds.hive_vars['location_id'],
+                                                                           ds.hive_vars['calls']))
+
+
+# ### Or load a sample file
+
+# In[14]:
+
+
 ## Use this in case you want to sample the data and run the code on the sample
 
-#ds.sample_and_save(number_of_ids=1000)
-ds.load_sample('sample_feb_mar2020')
-ds.parquet_df = ds.sample_df
+# #ds.sample_and_save(number_of_ids=1000)
+# ds.load_sample('sample_feb_mar2020')
+# ds.parquet_df = ds.sample_df
 
 
 # ## Load geo data
 
-# In[ ]:
+# In[13]:
 
 
 ds.load_geo_csvs()
@@ -153,7 +162,7 @@
 
 # ## Priority indicators for admin2
 
-# In[ ]:
+# In[15]:
 
 
 agg_custom = custom_aggregator(result_stub = '/admin2/custom',
@@ -186,11 +195,3 @@
 
 agg_custom.attempt_aggregation()
 
-
-# # Produce script
-
-# In[ ]:
-
-
-get_ipython().system('jupyter nbconvert --to script *.ipynb')
-