Skip to content

Commit bfee4c8

Browse files
committed
Address PR comments
1 parent 6ef68d4 commit bfee4c8

File tree

2 files changed

+71
-52
lines changed

2 files changed

+71
-52
lines changed

redisgraph_bulk_loader/bulk_update.py

Lines changed: 24 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -2,54 +2,47 @@
22
import csv
33
import redis
44
import click
5-
from redis import ResponseError
5+
from redisgraph import Graph
66
from timeit import default_timer as timer
77

88

99
def utf8len(s):
1010
return len(s.encode('utf-8'))
1111

1212

13+
# Count number of rows in file.
14+
def count_entities(filename):
15+
entities_count = 0
16+
with open(filename, 'rt') as f:
17+
entities_count = sum(1 for line in f)
18+
return entities_count
19+
20+
1321
class BulkUpdate:
1422
"""Handler class for emitting bulk update commands"""
15-
def __init__(self, graph, max_token_size, separator, no_header, filename, query, variable_name, client):
23+
def __init__(self, graph_name, max_token_size, separator, no_header, filename, query, variable_name, client):
1624
self.separator = separator
1725
self.no_header = no_header
1826
self.query = " ".join(["UNWIND $rows AS", variable_name, query])
1927
self.buffer_size = 0
2028
self.max_token_size = max_token_size * 1024 * 1024 - utf8len(self.query)
21-
self.graph = graph
2229
self.filename = filename
23-
self.client = client
30+
self.graph_name = graph_name
31+
self.graph = Graph(graph_name, client)
2432
self.statistics = {}
2533

26-
# Count number of rows in file.
27-
def count_entities(self):
28-
entities_count = 0
29-
with open(self.filename, 'rt') as f:
30-
entities_count = sum(1 for line in f)
31-
return entities_count
32-
3334
def update_statistics(self, result):
34-
for raw_stat in result[0]:
35-
stat = raw_stat.split(": ")
36-
key = stat[0]
35+
for key, new_val in result.statistics.items():
3736
try:
3837
val = self.statistics[key]
3938
except KeyError:
4039
val = 0
41-
val += float(stat[1].split(" ")[0])
40+
val += new_val
4241
self.statistics[key] = val
4342

4443
def emit_buffer(self, rows):
4544
command = " ".join([rows, self.query])
46-
try:
47-
result = self.client.execute_command("GRAPH.QUERY", self.graph, command)
48-
except ResponseError as e:
49-
raise e
50-
# If we encountered a run-time error, the last response element will be an exception.
51-
if isinstance(result[-1], ResponseError):
52-
raise result[-1]
45+
result = self.graph.query(command)
5346
self.update_statistics(result)
5447

5548
def quote_string(self, cell):
@@ -65,8 +58,14 @@ def quote_string(self, cell):
6558
cell = "".join(["\"", cell, "\""])
6659
return cell
6760

61+
# Raise an exception if the query triggers a compile-time error
62+
def validate_query(self):
63+
command = " ".join(["CYPHER rows=[]", self.query])
64+
# The plan call will raise an error if the query is malformed or invalid.
65+
self.graph.execution_plan(command)
66+
6867
def process_update_csv(self):
69-
entity_count = self.count_entities()
68+
entity_count = count_entities(self.filename)
7069

7170
with open(self.filename, 'rt') as f:
7271
if self.no_header is False:
@@ -75,7 +74,7 @@ def process_update_csv(self):
7574
reader = csv.reader(f, delimiter=self.separator, skipinitialspace=True, quoting=csv.QUOTE_NONE, escapechar='\\')
7675

7776
rows_strs = []
78-
with click.progressbar(reader, length=entity_count, label=self.graph) as reader:
77+
with click.progressbar(reader, length=entity_count, label=self.graph_name) as reader:
7978
for row in reader:
8079
# Prepare the string representation of the current row.
8180
row = ",".join([self.quote_string(cell) for cell in row])
@@ -145,6 +144,7 @@ def bulk_update(graph, host, port, password, unix_socket_path, query, variable_n
145144
pass
146145

147146
updater = BulkUpdate(graph, max_token_size, separator, no_header, csv, query, variable_name, client)
147+
updater.validate_query()
148148
updater.process_update_csv()
149149

150150
end_time = timer()

test/test_bulk_update.py

Lines changed: 47 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -97,10 +97,35 @@ def test02_traversal_updates(self):
9797
["c", "c2"]]
9898
self.assertEqual(query_result.result_set, expected_result)
9999

100-
def test03_custom_delimiter(self):
101-
"""Validate that non-comma delimiters produce the correct results."""
100+
def test03_datatypes(self):
101+
"""Validate that all RedisGraph datatypes are supported by the bulk updater."""
102102
graphname = "tmpgraph2"
103103
# Write temporary files
104+
with open('/tmp/csv.tmp', mode='w') as csv_file:
105+
out = csv.writer(csv_file)
106+
out.writerow([0, 1.5, "true", "string", "[1, 'nested_str']"])
107+
108+
runner = CliRunner()
109+
res = runner.invoke(bulk_update, ['--csv', '/tmp/csv.tmp',
110+
'--query', 'CREATE (a:L) SET a.intval = row[0], a.doubleval = row[1], a.boolval = row[2], a.stringval = row[3], a.arrayval = row[4]',
111+
'--no-header',
112+
graphname], catch_exceptions=False)
113+
114+
self.assertEqual(res.exit_code, 0)
115+
self.assertIn('Nodes created: 1', res.output)
116+
self.assertIn('Properties set: 5', res.output)
117+
118+
tmp_graph = Graph(graphname, self.redis_con)
119+
query_result = tmp_graph.query('MATCH (a) RETURN a.intval, a.doubleval, a.boolval, a.stringval, a.arrayval')
120+
121+
# Validate that the expected results are all present in the graph
122+
expected_result = [[0, 1.5, True, "string", "[1,'nested_str']"]]
123+
self.assertEqual(query_result.result_set, expected_result)
124+
125+
def test04_custom_delimiter(self):
126+
"""Validate that non-comma delimiters produce the correct results."""
127+
graphname = "tmpgraph3"
128+
# Write temporary files
104129
with open('/tmp/csv.tmp', mode='w') as csv_file:
105130
out = csv.writer(csv_file, delimiter='|')
106131
out.writerow(["id", "name"])
@@ -140,7 +165,7 @@ def test03_custom_delimiter(self):
140165
self.assertNotIn('Nodes created', res.output)
141166
self.assertNotIn('Properties set', res.output)
142167

143-
def test04_custom_variable_name(self):
168+
def test05_custom_variable_name(self):
144169
"""Validate that the user can specify the name of the 'row' query variable."""
145170
graphname = "variable_name"
146171
runner = CliRunner()
@@ -178,9 +203,9 @@ def test04_custom_variable_name(self):
178203
['Valerie Abigail Arad', 31, 'female', 'married']]
179204
self.assertEqual(query_result.result_set, expected_result)
180205

181-
def test05_no_header(self):
206+
def test06_no_header(self):
182207
"""Validate that the '--no-header' option works properly."""
183-
graphname = "tmpgraph3"
208+
graphname = "tmpgraph4"
184209
# Write temporary files
185210
with open('/tmp/csv.tmp', mode='w') as csv_file:
186211
out = csv.writer(csv_file)
@@ -208,7 +233,7 @@ def test05_no_header(self):
208233
[5, "b"]]
209234
self.assertEqual(query_result.result_set, expected_result)
210235

211-
def test06_batched_update(self):
236+
def test07_batched_update(self):
212237
"""Validate that updates performed over multiple batches produce the correct results."""
213238
graphname = "batched_update"
214239

@@ -238,9 +263,9 @@ def test06_batched_update(self):
238263
expected_result = [[prop_str]]
239264
self.assertEqual(query_result.result_set, expected_result)
240265

241-
def test07_runtime_error(self):
266+
def test08_runtime_error(self):
242267
"""Validate that run-time errors are captured by the bulk updater."""
243-
graphname = "tmpgraph1"
268+
graphname = "tmpgraph5"
244269

245270
# Write temporary files
246271
with open('/tmp/csv.tmp', mode='w') as csv_file:
@@ -255,9 +280,21 @@ def test07_runtime_error(self):
255280
self.assertNotEqual(res.exit_code, 0)
256281
self.assertIn("Cannot merge node", str(res.exception))
257282

258-
def test07_invalid_inputs(self):
283+
def test09_compile_time_error(self):
284+
"""Validate that malformed queries trigger an early exit from the bulk updater."""
285+
graphname = "tmpgraph5"
286+
runner = CliRunner()
287+
res = runner.invoke(bulk_update, ['--csv', '/tmp/csv.tmp',
288+
'--query', 'CREATE (:L {val: row[0], val2: undefined_identifier})',
289+
'--no-header',
290+
graphname])
291+
292+
self.assertNotEqual(res.exit_code, 0)
293+
self.assertIn("undefined_identifier not defined", str(res.exception))
294+
295+
def test10_invalid_inputs(self):
259296
"""Validate that the bulk updater handles invalid inputs correctly."""
260-
graphname = "tmpgraph1"
297+
graphname = "tmpgraph6"
261298

262299
# Attempt to insert a non-existent CSV file.
263300
runner = CliRunner()
@@ -267,21 +304,3 @@ def test07_invalid_inputs(self):
267304

268305
self.assertNotEqual(res.exit_code, 0)
269306
self.assertIn("No such file", str(res.exception))
270-
271-
# Write temporary files
272-
with open('/tmp/csv.tmp', mode='w') as csv_file:
273-
out = csv.writer(csv_file)
274-
out.writerow(["id", "name"])
275-
out.writerow([0, "a"])
276-
out.writerow([5, "b"])
277-
out.writerow([3, "c"])
278-
279-
# Attempt to access a non-existent column.
280-
res = runner.invoke(bulk_update, ['--csv', '/tmp/csv.tmp',
281-
'--query', 'CREATE (:L {val: row[3]})',
282-
graphname])
283-
284-
# self.assertNotEqual(res.exit_code, 0)
285-
# import ipdb
286-
# ipdb.set_trace()
287-
# self.assertIn("No such file", str(res.exception))

0 commit comments

Comments
 (0)