Skip to content

Commit 088173f

Browse files
committed
Map seed file when generating inputs
1 parent a320bb9 commit 088173f

File tree

6 files changed

+50
-48
lines changed

6 files changed

+50
-48
lines changed
Lines changed: 11 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,12 @@
11
import os
22
import sys
3-
import string
43

54
# Read the node input file and translate the input IDs into a contiguous range.
65
# Then, read the relation input file and translate all source and destination node IDs
76
# to their updated contiguous values.
87

98
# User-provided input data directory
10-
if len(sys.argv) < 2 or os.path.exists(sys.argv[1]) == False:
9+
if len(sys.argv) < 2 or os.path.exists(sys.argv[1]) is False:
1110
print("Usage: generate_inputs.py [path_to_inputs]")
1211
exit(1)
1312

@@ -16,6 +15,7 @@
1615
# Input filenames
1716
nodefile = 'graph500-22_unique_node'
1817
relfile = 'graph500-22'
18+
seedfile = 'graph500-22-seed'
1919

2020
# Output data directory
2121
datadir = 'data'
@@ -26,30 +26,21 @@
2626
except OSError:
2727
pass
2828

29-
# Count the number of unique nodes in the data set
30-
num_nodes = sum(1 for line in open(os.path.join(inputdir, nodefile)))
31-
3229
updated_id = 0
3330

3431
updated_node_file = open(os.path.join(datadir, nodefile), 'w')
3532
updated_node_file.write('id\n') # Output a header row
3633
updated_relation_file = open(os.path.join(datadir, relfile), 'w')
37-
38-
# Scan the node file to find the highest node ID
39-
max_node = -1
40-
with open(os.path.join(inputdir, nodefile)) as f:
41-
for line in f:
42-
max_node = max(max_node, int(line))
34+
updated_seed_file = open(os.path.join(datadir, seedfile), 'w')
4335

4436
# Map every node ID to its line number
4537
# and generate an updated node file.
46-
placement = [0]*(max_node + 1)
38+
placement = {}
4739
with open(os.path.join(inputdir, nodefile)) as f:
4840
for line in f:
49-
node = int(line)
50-
placement[node] = updated_id
51-
updated_id += 1
41+
placement[int(line)] = updated_id
5242
updated_node_file.write('%d\n' % (updated_id))
43+
updated_id += 1
5344

5445
with open(os.path.join(inputdir, relfile)) as f:
5546
for line in f:
@@ -63,5 +54,10 @@
6354
# Output the updated edge description
6455
updated_relation_file.write("%d,%d\n" % (a, b))
6556

57+
with open(os.path.join(inputdir, seedfile)) as f:
58+
updated_seed_file.write(' '.join(str(placement[int(i)]) for i in f.read().split()))
59+
60+
6661
updated_node_file.close()
6762
updated_relation_file.close()
63+
updated_seed_file.close()
Lines changed: 14 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -1,22 +1,21 @@
11
import os
22
import sys
3-
import string
43

54
# Read the node input file and translate the input IDs into a contiguous range.
65
# Then, read the relation input file and translate all source and destination node IDs
76
# to their updated contiguous values.
87

98
# User-provided input data directory
10-
if len(sys.argv) < 2 or os.path.exists(sys.argv[1]) == False:
9+
if len(sys.argv) < 2 or os.path.exists(sys.argv[1]) is False:
1110
print("Usage: generate_inputs.py [path_to_inputs]")
1211
exit(1)
1312

1413
inputdir = sys.argv[1]
1514

1615
# Input filenames
1716
nodefile = 'twitter_rv.net_unique_node'
18-
nodefile_out = 'twitter_rv_net_unique_node'
19-
relfile = 'twitter_rv'
17+
relfile = 'twitter_rv.net'
18+
seedfile = 'twitter_rv.net-seed'
2019

2120
# Output data directory
2221
datadir = 'data'
@@ -27,30 +26,21 @@
2726
except OSError:
2827
pass
2928

30-
# Count the number of unique nodes in the data set
31-
num_nodes = sum(1 for line in open(os.path.join(inputdir, nodefile)))
32-
3329
updated_id = 0
3430

35-
updated_node_file = open(os.path.join(datadir, nodefile_out), 'w')
31+
updated_node_file = open(os.path.join(datadir, nodefile.replace('.', '_')), 'w')
3632
updated_node_file.write('id\n') # Output a header row
37-
updated_relation_file = open(os.path.join(datadir, relfile), 'w')
38-
39-
# Scan the node file to find the highest node ID
40-
max_node = -1
41-
with open(os.path.join(inputdir, nodefile)) as f:
42-
for line in f:
43-
max_node = max(max_node, int(line))
33+
updated_relation_file = open(os.path.join(datadir, relfile.replace('.', '_')), 'w')
34+
updated_seed_file = open(os.path.join(datadir, seedfile.replace('.', '_')), 'w')
4435

4536
# Map every node ID to its line number
4637
# and generate an updated node file.
47-
placement = [0]*(max_node + 1)
38+
placement = {}
4839
with open(os.path.join(inputdir, nodefile)) as f:
4940
for line in f:
50-
node = int(line)
51-
placement[node] = updated_id
52-
updated_id += 1
41+
placement[int(line)] = updated_id
5342
updated_node_file.write('%d\n' % (updated_id))
43+
updated_id += 1
5444

5545
with open(os.path.join(inputdir, relfile)) as f:
5646
for line in f:
@@ -64,5 +54,10 @@
6454
# Output the updated edge description
6555
updated_relation_file.write("%d,%d\n" % (a, b))
6656

57+
with open(os.path.join(inputdir, seedfile)) as f:
58+
updated_seed_file.write(' '.join(str(placement[int(i)]) for i in f.read().split()))
59+
60+
6761
updated_node_file.close()
6862
updated_relation_file.close()
63+
updated_seed_file.close()

benchmark/redisgraph/kn.py

Lines changed: 20 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@
99

1010
import sys
1111
import os
12-
import threading
1312
import click
1413
import multiprocessing
1514
from query_runner import *
@@ -18,6 +17,7 @@
1817
# Global, map of reports.
1918
seedReports = {}
2019

20+
2121
#####################################################################
2222
# Initialize seed reporting,
2323
# seedReports[seed][iterations] contains the number of iterations required for seed
@@ -28,10 +28,11 @@ def InitSeedReports(seeds, iterations):
2828
for s in seeds:
2929
seedReports[s] = []
3030

31+
3132
#####################################################################
3233
# Generate a report summary.
3334
#######################################################################
34-
def FinalizeReport(depth, threads):
35+
def FinalizeReport(graphid, depth, threads):
3536
global seedReports
3637
# seed=19284, k=1, runId=0, avgNeighbor=91.0, execTime=0.197093009949
3738
# AVG Seed iterations.
@@ -42,14 +43,21 @@ def FinalizeReport(depth, threads):
4243
threadsTotalRuntime = [0] * threads
4344
runs = 0
4445

46+
# map to raw seed id
47+
raw_seeds = []
48+
if os.path.exists(graphid + '_unique_node'):
49+
for line in open(graphid + '_unique_node'):
50+
raw_seeds.append(line.strip())
51+
4552
for seed in seedReports:
53+
seed_raw = raw_seeds[int(seed)]
4654
report = seedReports[seed]
4755
for iterationReport in report:
4856
avgNeighbor = iterationReport['avgN']
4957
execTime = iterationReport['totalTime']
5058
threadId = iterationReport['threadId']
5159
threadsTotalRuntime[threadId] += execTime
52-
output += "seed=%s, k=%d, avgNeighbor=%d, execTime=%f[ms]\r\n" %(seed, depth, avgNeighbor, execTime)
60+
output += "seed=%s, k=%d, avgNeighbor=%d, execTime=%f[ms]\r\n" %(seed_raw, depth, avgNeighbor, execTime)
5361
output += "**************************************************************\r\n"
5462

5563
avgKNSize += avgNeighbor
@@ -69,6 +77,7 @@ def FinalizeReport(depth, threads):
6977

7078
return output
7179

80+
7281
#####################################################################
7382
# K-hop-path-neighbor-count benchmark workload.
7483
# (1) read prepared random nodes from a seed file under seed folder.
@@ -87,6 +96,7 @@ def GetSeeds(seed_file_path, count):
8796
print("Seed file does not contain enough seeds.")
8897
sys.exit()
8998

99+
90100
###############################################################
91101
# function: thread worker, pull work item from pool
92102
# and execute query via runner
@@ -97,7 +107,7 @@ def RunKNLatencyThread(graphid, threadId, depth, provider, label, seedPool, repo
97107
elif provider == "tigergraph":
98108
runner = TigerGraphQueryRunner()
99109
else:
100-
print "Unknown runner %s, quiting" % provider
110+
print("Unknown runner %s, quiting" % provider)
101111
sys.exit()
102112

103113
# As long as there's work to do...
@@ -132,23 +142,24 @@ def RunKNLatencyThread(graphid, threadId, depth, provider, label, seedPool, repo
132142
iterationSummary['totalTime'] = iterationTime
133143
reportQueue.put(iterationSummary, False)
134144

145+
135146
###############################################################
136147
# function: check the total latency for k-hop-path neighbor count
137148
# query for a given set of seeds.
138149
################################################################
139150
@click.command()
140-
@click.option('--graphid', '-g', default='graph500', help="graph id")
141-
@click.option('--seedfile', '-s', default='./seeds', help="seed file")
151+
@click.option('--graphid', '-g', default='graph500-22',
152+
type=click.Choice(['graph500-22', 'twitter_rv_net']), help="graph id")
142153
@click.option('--count', '-c', default=20, help="number of seeds")
143154
@click.option('--depth', '-d', default=1, help="number of hops to perform")
144155
@click.option('--provider', '-p', default='redisgraph', help="graph identifier")
145156
@click.option('--label', '-l', default='label', help="node label")
146157
@click.option('--threads', '-t', default=2, help="number of querying threads")
147158
@click.option('--iterations', '-i', default=10, help="number of iterations per query")
148-
def RunKNLatency(graphid, seedfile, count, depth, provider, label, threads, iterations):
159+
def RunKNLatency(graphid, count, depth, provider, label, threads, iterations):
149160
#create result folder
150161
global seedReports
151-
162+
seedfile = os.path.join('data', graphid + '-seed')
152163
seeds = GetSeeds(seedfile, count)
153164

154165
# Create a pool of seeds.
@@ -188,7 +199,7 @@ def RunKNLatency(graphid, seedfile, count, depth, provider, label, threads, iter
188199
seedReports[seed].append({'avgN': avgN, 'totalTime': totalTime, 'threadId': threadId})
189200

190201
print("Finalizing report")
191-
output = FinalizeReport(depth, threads)
202+
output = FinalizeReport(graphid, depth, threads)
192203
dirName = "./result_" + provider +"/"
193204
fileName = "KN-latency-k%d-threads%d-iter%d" %(depth, threads, iterations)
194205
outputPath = os.path.join(dirName, fileName)

benchmark/redisgraph/query_runner.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -45,7 +45,7 @@ def KN(self, root, depth):
4545
query = "MATCH (s:%s)-[*%d]->(t) WHERE s.id=%d RETURN count(t)" % (self.label, int(depth), int(root))
4646
result = self.driver.execute_command('graph.query', self.graphid, query)
4747
except Exception as e: # timeout, we return -1, reset session
48-
print "Exception: %s" % e
48+
print("Exception: %s" % e)
4949
raise e
5050
return -1
5151
else:

benchmark/redisgraph/redisgraph_load_graph500.sh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ fi
77
python generate_graph500_inputs.py $2 || exit 1
88

99
# Run RedisGraph bulk import script
10-
python $1/demo/bulk_insert/bulk_insert.py graph500 -n data/graph500-22_unique_node -r data/graph500-22 || exit 1
10+
python $1/demo/bulk_insert/bulk_insert.py graph500-22 -n data/graph500-22_unique_node -r data/graph500-22 || exit 1
1111

1212
# Create index on node ID property
13-
~/redis/src/redis-cli GRAPH.QUERY graph500 "create index on :graph500-22_unique_node(id)"
13+
~/redis/src/redis-cli GRAPH.QUERY graph500-22 "create index on :graph500-22_unique_node(id)"

benchmark/redisgraph/redisgraph_load_twitter.sh

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ fi
77
python generate_twitter_inputs.py $2 || exit 1
88

99
# Run RedisGraph bulk import script
10-
python $1/demo/bulk_insert/bulk_insert.py twitter_rv -n data/twitter_rv_net_unique_node -r data/twitter_rv || exit 1
10+
python $1/demo/bulk_insert/bulk_insert.py twitter_rv_net -n data/twitter_rv_net_unique_node -r data/twitter_rv_net || exit 1
1111

1212
# Create index on node ID property
13-
~/redis/src/redis-cli GRAPH.QUERY twitter_rv "create index on :twitter_rv_net_unique_node(id)"
13+
~/redis/src/redis-cli GRAPH.QUERY twitter_rv_net "create index on :twitter_rv_net_unique_node(id)"

0 commit comments

Comments (0)