Skip to content

Couldn't able to save the model file while using Keras example with a local dataset. #398

Closed
@vamsinimmala1992

Description

@vamsinimmala1992

I am using a keras example with TensorflowOnSpark. I am trying to save the model file to the hdfs location I specify as args.

With no exception or errors in the Yarn Log the process have been completed, BUT I DONT SEE ANY MODEL FILE SAVED IN THE HDFS.

below is the code and the log. Please check.

Also I am not inputting any validation data in the fit_generator method. is it mandatory?

from __future__ import print_function


def main_fun(args, ctx):
	import numpy
	import os
	import tensorflow as tf
	from tensorflow.python import keras
	from tensorflow.python.keras import backend as K
	from tensorflow.python.keras.datasets import mnist
	from tensorflow.python.keras.models import Sequential, load_model, save_model
	from tensorflow.python.keras.layers import Dense, Dropout
	from tensorflow.python.keras.optimizers import RMSprop
	from tensorflow.python.keras.callbacks import LambdaCallback, TensorBoard
	from tensorflow.python.saved_model import builder as saved_model_builder
	from tensorflow.python.saved_model import tag_constants
	from tensorflow.python.saved_model.signature_def_utils_impl import predict_signature_def
	from tensorflowonspark import TFNode
	
	cluster, server = TFNode.start_cluster_server(ctx)
	
	if ctx.job_name == "ps":
		server.join()
	elif ctx.job_name == "worker":
		
		def generate_rdd_data(tf_feed, batch_size):
			print("generate_rdd_data invoked")
			while True:
				batch = tf_feed.next_batch(batch_size)
				feature_vector = []
				lbls = []
				for item in batch:
					feature_vector.append(item[0])
					lbls.append(item[1])
				features = numpy.array(feature_vector).astype('float32')
				labels = numpy.stack(lbls).astype('float32')
				yield (features, labels)
		
		with tf.device(tf.train.replica_device_setter(
		  worker_device="/job:worker/task:%d" % ctx.task_index,
		  cluster=cluster)):
			
			batch_size = 100
			num_classes = 14
			# args.mode == 'spark':
			x_train = tf.placeholder(tf.float32, [None, 28047], name="x_train")
			y_train = tf.placeholder(tf.float32, [None, 14], name="y_train")
			
			model = Sequential()
			model.add(Dense(512, activation='relu', input_shape=(28047,)))
			model.add(Dropout(0.2))
			model.add(Dense(512, activation='relu'))
			model.add(Dropout(0.2))
			model.add(Dense(14, activation='softmax'))
			
			model.summary()
			
			model.compile(loss='categorical_crossentropy',
			              optimizer=tf.train.RMSPropOptimizer(learning_rate=0.001),
			              metrics=['accuracy'])
		
		saver = tf.train.Saver()
		
		with tf.Session(server.target) as sess:
			K.set_session(sess)
			
			def save_checkpoint(epoch, logs=None):
				if epoch == 1:
					tf.train.write_graph(sess.graph.as_graph_def(), args.model_dir, 'graph.pbtxt')
				saver.save(sess, os.path.join(args.model_dir, 'model.ckpt'), global_step=epoch * args.steps_per_epoch)
			
			#ckpt_callback = LambdaCallback(on_epoch_end=save_checkpoint)
			#tb_callback = TensorBoard(log_dir=args.model_dir, histogram_freq=1, write_graph=True, write_images=True)
			
			# Add callbacks to save model checkpoint and tensorboard events (on worker:0 only)
			#callbacks = [ckpt_callback, tb_callback] if ctx.task_index == 0 else None
			
			# args.input_mode == 'spark':
			#  train on data read from a generator which is producing data from a Spark RDD
			tf_feed = TFNode.DataFeed(ctx.mgr)
			model.fit_generator(generator=generate_rdd_data(tf_feed, batch_size),
			                    steps_per_epoch=args.steps_per_epoch,
			                    epochs=args.epochs,
			                    verbose=1,
			                    callbacks=None)
			
			if args.export_dir and ctx.job_name == 'worker' and ctx.task_index == 0:
				# save a local Keras model, so we can reload it with an inferencing learning_phase
				save_model(model, "tmp_model")
				
				# reload the model
				K.set_learning_phase(False)
				new_model = load_model("tmp_model")
				
				# export a saved_model for inferencing
				builder = saved_model_builder.SavedModelBuilder(args.export_dir)
				signature = predict_signature_def(inputs={'fetures': new_model.input},
				                                  outputs={'scores': new_model.output})
				builder.add_meta_graph_and_variables(sess=sess,
				                                     tags=[tag_constants.SERVING],
				                                     signature_def_map={'predict': signature},
				                                     clear_devices=True)
				builder.save()
			
			if args.input_mode == 'spark':
				tf_feed.terminate()


if __name__ == '__main__':
	import argparse
	from pyspark.context import SparkContext
	from pyspark.conf import SparkConf
	from tensorflowonspark import TFCluster
	import keras
	
	sc = SparkContext(conf=SparkConf().setAppName("PhaseOneModelling"))
	executors = sc._conf.get("spark.executor.instances")
	num_executors = int(executors) if executors is not None else 1
	num_ps = 1
	
	parser = argparse.ArgumentParser()
	parser.add_argument("--cluster_size", help="number of nodes in the cluster", type=int, default=num_executors)
	parser.add_argument("--epochs", help="number of epochs of training data", type=int, default=20)
	parser.add_argument("--export_dir", help="directory to export saved_model")
	parser.add_argument("--data", help="HDFS path to data in parallelized CSV format")
	# parser.add_argument("--input_mode", help="input mode (tf|spark)", default="tf")
	parser.add_argument("--labels", help="HDFS path to MNIST labels in parallelized CSV format")
	parser.add_argument("--model_dir", help="directory to write model checkpoints")
	parser.add_argument("--num_ps", help="number of ps nodes", type=int, default=1)
	parser.add_argument("--steps_per_epoch", help="number of steps per epoch", type=int, default=100)
	parser.add_argument("--tensorboard", help="launch tensorboard process", action="store_true")
	
	args = parser.parse_args()
	print("args:", args)
	
	data = sc.textFile(args.data)
	data = data.map(lambda l: l.encode("UTF8", "ignore").split('\t'))
	
	labels = data.map(lambda x: x[1])
	data = data.map(lambda x: x[19:28066])
	
	header = data.first()
	data = data.filter(lambda line: line != header)
	label_header = labels.first()
	labels = labels.filter(lambda line: line != label_header)
	
	# convert values to float
	convertToFloat = lambda data: [float(str(x)) for x in data]
	dataset = data.map(convertToFloat)
	labels = labels.map(lambda x: float(x))
	labels = labels.map(lambda x: keras.utils.to_categorical(x, num_classes=14))
	
	# Split the data for train and validation
	#testRDD, trainRDD = data.randomSplit(weights=[0.001, 0.999], seed=42)
	#testlabelRDD, trainlabelRDD = labels.randomSplit(weights=[0.001, 0.999], seed=42)
	
	dataRDD = dataset.zip(labels)
	
	#dataRDD = dataRDD.sample(False, 0.01, 42)
	#trainRDD = trainRDD.zip(trainlabelRDD)
	
	cluster = TFCluster.run(sc, main_fun, args, args.cluster_size, args.num_ps, args.tensorboard,
	                        TFCluster.InputMode.SPARK, log_dir=args.model_dir)
	cluster.train(dataRDD, args.epochs)

	cluster.shutdown()

And the log as follows:

Using TensorFlow backend.
19/03/03 22:48:13 INFO SparkContext: Running Spark version 2.3.0.2.6.5.0-292
19/03/03 22:48:13 INFO SparkContext: Submitted application: PhaseOneModelling
19/03/03 22:48:13 INFO SecurityManager: Changing view acls to: Surya@..com
19/03/03 22:48:13 INFO SecurityManager: Changing modify acls to: Surya@..com
19/03/03 22:48:13 INFO SecurityManager: Changing view acls groups to:
19/03/03 22:48:13 INFO SecurityManager: Changing modify acls groups to:
19/03/03 22:48:13 INFO SecurityManager: SecurityManager: authentication disabled; ui acls disabled; users  with view permissions: Set(Surya@..com); groups with view permissions: Se                                                t(); users  with modify permissions: Set(Surya@..com); groups with modify permissions: Set()
19/03/03 22:48:13 INFO Utils: Successfully started service 'sparkDriver' on port 44164.
19/03/03 22:48:13 INFO SparkEnv: Registering MapOutputTracker
19/03/03 22:48:13 INFO SparkEnv: Registering BlockManagerMaster
19/03/03 22:48:13 INFO BlockManagerMasterEndpoint: Using org.apache.spark.storage.DefaultTopologyMapper for getting topology information
19/03/03 22:48:13 INFO BlockManagerMasterEndpoint: BlockManagerMasterEndpoint up
19/03/03 22:48:13 INFO DiskBlockManager: Created local directory at /tmp/blockmgr-966d6dac-7f6b-4411-91e2-7b4c07185c8d
19/03/03 22:48:13 INFO MemoryStore: MemoryStore started with capacity 153.4 GB
19/03/03 22:48:13 INFO SparkEnv: Registering OutputCommitCoordinator
19/03/03 22:48:14 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
19/03/03 22:48:14 INFO Utils: Successfully started service 'SparkUI' on port 4041.
19/03/03 22:48:14 INFO SparkUI: Bound SparkUI to 0.0.0.0, and started at http://instance-2026033.ipa.ba..com:4041
19/03/03 22:48:15 INFO RMProxy: Connecting to ResourceManager at instance-2026030.ipa.ba..com/10.28.26.30:8050
19/03/03 22:48:15 INFO Client: Requesting a new application from cluster with 14 NodeManagers
19/03/03 22:48:15 INFO Client: Verifying our application has not requested more than the maximum memory capability of the cluster (202752 MB per container)
19/03/03 22:48:15 INFO Client: Will allocate AM container, with 896 MB memory including 384 MB overhead
19/03/03 22:48:15 INFO Client: Setting up container launch context for our AM
19/03/03 22:48:15 INFO Client: Setting up the launch environment for our AM container
19/03/03 22:48:15 INFO Client: Preparing resources for our AM container
19/03/03 22:48:15 INFO HadoopFSDelegationTokenProvider: getting token for: DFS[DFSClient[clientName=DFSClient_NONMAPREDUCE_-438946188_31, ugi=Surya@..com (auth:KERBEROS)]]
19/03/03 22:48:16 INFO KerberosName: Non-simple name Surya@..com after auth_to_local rule RULE:[1:$1@$0](.*@..com)/L
19/03/03 22:48:16 INFO DFSClient: Created HDFS_DELEGATION_TOKEN token 6651 for Surya@..com on 10.28.26.29:8020
19/03/03 22:48:19 INFO Client: Use hdfs cache file as spark.yarn.archive for HDP, hdfsCacheFile:hdfs://instance-2026029.ipa.ba..com:8020/hdp/apps/2.6.5.0-292/spark2/spark2-hdp-yarn-archive                                                .tar.gz
19/03/03 22:48:19 INFO Client: Source and destination file systems are the same. Not copying hdfs://instance-2026029.ipa.ba..com:8020/hdp/apps/2.6.5.0-292/spark2/spark2-hdp-yarn-archive.ta                                                r.gz
19/03/03 22:48:19 INFO Client: Uploading resource file:/usr/hdp/2.6.5.0-292/spark2/python/lib/pyspark.zip -> hdfs://instance-2026029.ipa.ba..com:8020/user/Surya@..com/.sparkStagin                                                g/application_1551114784635_0177/pyspark.zip
19/03/03 22:48:19 INFO Client: Uploading resource file:/usr/hdp/2.6.5.0-292/spark2/python/lib/py4j-0.10.6-src.zip -> hdfs://instance-2026029.ipa.ba..com:8020/user/Surya@..com/.spa                                                rkStaging/application_1551114784635_0177/py4j-0.10.6-src.zip
19/03/03 22:48:19 INFO Client: Uploading resource file:/tmp/spark-2c81098d-4f91-4a1d-87ea-71ecf1c72204/__spark_conf__626918427791303293.zip -> hdfs://instance-2026029.ipa.ba..com:8020/user                                                /Surya@..com/.sparkStaging/application_1551114784635_0177/__spark_conf__.zip
19/03/03 22:48:19 INFO SecurityManager: Changing view acls to: Surya@..com
19/03/03 22:48:19 INFO SecurityManager: Changing modify acls to: Surya@..com
19/03/03 22:48:19 INFO SecurityManager: Changing view acls groups to:
19/03/03 22:48:19 INFO SecurityManager: Changing modify acls groups to:
19/03/03 22:48:19 INFO SecurityManager: SecurityManager: authentication disabled; ui acls disabled; users  with view permissions: Set(Surya@..com); groups with view permissions: Se                                                t(); users  with modify permissions: Set(Surya@..com); groups with modify permissions: Set()
19/03/03 22:48:19 INFO Client: Submitting application application_1551114784635_0177 to ResourceManager
19/03/03 22:48:22 INFO YarnClientImpl: Submitted application application_1551114784635_0177
19/03/03 22:48:22 INFO SchedulerExtensionServices: Starting Yarn extension services with app application_1551114784635_0177 and attemptId None
19/03/03 22:48:23 INFO Client: Application report for application_1551114784635_0177 (state: ACCEPTED)
19/03/03 22:48:23 INFO Client:
         client token: Token { kind: YARN_CLIENT_TOKEN, service:  }
         diagnostics: AM container is launched, waiting for AM container to Register with RM
         ApplicationMaster host: N/A
         ApplicationMaster RPC port: -1
         queue: production
         start time: 1551671302538
         final status: UNDEFINED
         tracking URL: https://instance-2026030.ipa.ba..com:8090/proxy/application_1551114784635_0177/
         user: Surya@..com
19/03/03 22:48:24 INFO Client: Application report for application_1551114784635_0177 (state: ACCEPTED)
19/03/03 22:48:25 INFO Client: Application report for application_1551114784635_0177 (state: ACCEPTED)
19/03/03 22:48:26 INFO Client: Application report for application_1551114784635_0177 (state: ACCEPTED)
19/03/03 22:48:27 INFO Client: Application report for application_1551114784635_0177 (state: ACCEPTED)
19/03/03 22:48:28 INFO Client: Application report for application_1551114784635_0177 (state: ACCEPTED)
19/03/03 22:48:29 INFO Client: Application report for application_1551114784635_0177 (state: ACCEPTED)
19/03/03 22:48:30 INFO Client: Application report for application_1551114784635_0177 (state: ACCEPTED)
19/03/03 22:48:31 INFO YarnClientSchedulerBackend: Add WebUI Filter. org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter, Map(PROXY_HOSTS -> instance-2026030.ipa.ba..com, PROXY_URI_                                                BASES -> https://instance-2026030.ipa.ba..com:8090/proxy/application_1551114784635_0177), /proxy/application_1551114784635_0177
19/03/03 22:48:31 INFO JettyUtils: Adding filter: org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter
19/03/03 22:48:31 INFO Client: Application report for application_1551114784635_0177 (state: ACCEPTED)
19/03/03 22:48:31 INFO YarnSchedulerBackend$YarnSchedulerEndpoint: ApplicationMaster registered as NettyRpcEndpointRef(spark-client://YarnAM)
19/03/03 22:48:32 INFO Client: Application report for application_1551114784635_0177 (state: RUNNING)
19/03/03 22:48:32 INFO Client:
         client token: Token { kind: YARN_CLIENT_TOKEN, service:  }
         diagnostics: N/A
         ApplicationMaster host: 10.28.26.40
         ApplicationMaster RPC port: 0
         queue: production
         start time: 1551671302538
         final status: UNDEFINED
         tracking URL: https://instance-2026030.ipa.ba..com:8090/proxy/application_1551114784635_0177/
         user: Surya@..com
19/03/03 22:48:32 INFO YarnClientSchedulerBackend: Application application_1551114784635_0177 has started running.
19/03/03 22:48:32 INFO Utils: Successfully started service 'org.apache.spark.network.netty.NettyBlockTransferService' on port 38256.
19/03/03 22:48:32 INFO NettyBlockTransferService: Server created on instance-2026033.ipa.ba..com:38256
19/03/03 22:48:32 INFO BlockManager: Using org.apache.spark.storage.RandomBlockReplicationPolicy for block replication policy
19/03/03 22:48:32 INFO BlockManagerMaster: Registering BlockManager BlockManagerId(driver, instance-2026033.ipa.ba..com, 38256, None)
19/03/03 22:48:32 INFO BlockManagerMasterEndpoint: Registering block manager instance-2026033.ipa.ba..com:38256 with 153.4 GB RAM, BlockManagerId(driver, instance-2026033.ipa.ba..com, 382                                                56, None)
19/03/03 22:48:32 INFO BlockManagerMaster: Registered BlockManager BlockManagerId(driver, instance-2026033.ipa.ba..com, 38256, None)
19/03/03 22:48:32 INFO BlockManager: Initialized BlockManager: BlockManagerId(driver, instance-2026033.ipa.ba..com, 38256, None)
19/03/03 22:48:33 INFO EventLoggingListener: Logging events to hdfs:/spark2-history/application_1551114784635_0177
19/03/03 22:48:38 INFO YarnSchedulerBackend$YarnDriverEndpoint: Registered executor NettyRpcEndpointRef(spark-client://Executor) (10.28.26.39:36672) with ID 3
19/03/03 22:48:38 INFO BlockManagerMasterEndpoint: Registering block manager instance-2026039.ipa.ba..com:33711 with 153.4 GB RAM, BlockManagerId(3, instance-2026039.ipa.ba..com, 33711, N                                                one)
19/03/03 22:48:39 INFO YarnSchedulerBackend$YarnDriverEndpoint: Registered executor NettyRpcEndpointRef(spark-client://Executor) (10.28.26.38:49710) with ID 2
19/03/03 22:48:39 INFO BlockManagerMasterEndpoint: Registering block manager instance-2026038.ipa.ba..com:45476 with 153.4 GB RAM, BlockManagerId(2, instance-2026038.ipa.ba..com, 45476, N                                                one)
19/03/03 22:48:39 INFO YarnSchedulerBackend$YarnDriverEndpoint: Registered executor NettyRpcEndpointRef(spark-client://Executor) (10.28.26.45:36844) with ID 1
19/03/03 22:48:39 INFO BlockManagerMasterEndpoint: Registering block manager instance-2026045.ipa.ba..com:45485 with 153.4 GB RAM, BlockManagerId(1, instance-2026045.ipa.ba..com, 45485, N                                                one)
19/03/03 22:48:39 INFO YarnSchedulerBackend$YarnDriverEndpoint: Registered executor NettyRpcEndpointRef(spark-client://Executor) (10.28.26.46:58368) with ID 4
19/03/03 22:48:39 INFO YarnSchedulerBackend$YarnDriverEndpoint: Registered executor NettyRpcEndpointRef(spark-client://Executor) (10.28.26.43:54760) with ID 6
19/03/03 22:48:39 INFO BlockManagerMasterEndpoint: Registering block manager instance-2026046.ipa.ba..com:41346 with 153.4 GB RAM, BlockManagerId(4, instance-2026046.ipa.ba..com, 41346, N                                                one)
19/03/03 22:48:40 INFO YarnClientSchedulerBackend: SchedulerBackend is ready for scheduling beginning after reached minRegisteredResourcesRatio: 0.8
19/03/03 22:48:40 INFO BlockManagerMasterEndpoint: Registering block manager instance-2026043.ipa.ba..com:33242 with 153.4 GB RAM, BlockManagerId(6, instance-2026043.ipa.ba..com, 33242, N                                                one)
args: Namespace(cluster_size=6, data='/user/imagen.admins/NormalizedAugustData/Wide/ReducedFeatures/wide_august_tf_idf_normalized_with_col_ReducedFeatures.tsv', epochs=5, export_dir='/tmp/m                                                ss/TensorflowOnSpark/export_dir/', labels=None, model_dir='/tmp/mss/TensorflowOnSpark/model_dir', num_ps=1, steps_per_epoch=2622, tensorboard=False)
19/03/03 22:48:40 INFO MemoryStore: Block broadcast_0 stored as values in memory (estimated size 466.7 KB, free 153.4 GB)
19/03/03 22:48:40 INFO MemoryStore: Block broadcast_0_piece0 stored as bytes in memory (estimated size 34.0 KB, free 153.4 GB)
19/03/03 22:48:40 INFO BlockManagerInfo: Added broadcast_0_piece0 in memory on instance-2026033.ipa.ba..com:38256 (size: 34.0 KB, free: 153.4 GB)
19/03/03 22:48:40 INFO SparkContext: Created broadcast 0 from textFile at NativeMethodAccessorImpl.java:0
19/03/03 22:48:40 INFO KerberosName: Non-simple name Surya@..com after auth_to_local rule RULE:[1:$1@$0](.*@..com)/L
19/03/03 22:48:40 INFO DFSClient: Created HDFS_DELEGATION_TOKEN token 6652 for Surya@..com on 10.28.26.29:8020
19/03/03 22:48:40 INFO TokenCache: Got dt for hdfs://instance-2026029.ipa.ba..com:8020; Kind: HDFS_DELEGATION_TOKEN, Service: 10.28.26.29:8020, Ident: (HDFS_DELEGATION_TOKEN token 6652 fo                                                r Surya@..com)
19/03/03 22:48:40 INFO FileInputFormat: Total input paths to process : 1
19/03/03 22:48:40 INFO SparkContext: Starting job: runJob at PythonRDD.scala:141
19/03/03 22:48:40 INFO DAGScheduler: Got job 0 (runJob at PythonRDD.scala:141) with 1 output partitions
19/03/03 22:48:40 INFO DAGScheduler: Final stage: ResultStage 0 (runJob at PythonRDD.scala:141)
19/03/03 22:48:40 INFO DAGScheduler: Parents of final stage: List()
19/03/03 22:48:40 INFO DAGScheduler: Missing parents: List()
19/03/03 22:48:40 INFO DAGScheduler: Submitting ResultStage 0 (PythonRDD[2] at RDD at PythonRDD.scala:48), which has no missing parents
19/03/03 22:48:40 INFO MemoryStore: Block broadcast_1 stored as values in memory (estimated size 6.4 KB, free 153.4 GB)
19/03/03 22:48:40 INFO MemoryStore: Block broadcast_1_piece0 stored as bytes in memory (estimated size 4.0 KB, free 153.4 GB)
19/03/03 22:48:40 INFO BlockManagerInfo: Added broadcast_1_piece0 in memory on instance-2026033.ipa.ba..com:38256 (size: 4.0 KB, free: 153.4 GB)
19/03/03 22:48:40 INFO SparkContext: Created broadcast 1 from broadcast at DAGScheduler.scala:1039
19/03/03 22:48:40 INFO DAGScheduler: Submitting 1 missing tasks from ResultStage 0 (PythonRDD[2] at RDD at PythonRDD.scala:48) (first 15 tasks are for partitions Vector(0))
19/03/03 22:48:40 INFO YarnScheduler: Adding task set 0.0 with 1 tasks
19/03/03 22:48:40 INFO TaskSetManager: Starting task 0.0 in stage 0.0 (TID 0, instance-2026046.ipa.ba..com, executor 4, partition 0, RACK_LOCAL, 8020 bytes)
19/03/03 22:48:41 INFO YarnSchedulerBackend$YarnDriverEndpoint: Registered executor NettyRpcEndpointRef(spark-client://Executor) (10.28.26.37:34390) with ID 5
19/03/03 22:48:41 INFO BlockManagerMasterEndpoint: Registering block manager instance-2026037.ipa.ba..com:45022 with 153.4 GB RAM, BlockManagerId(5, instance-2026037.ipa.ba..com, 45022, N                                                one)
19/03/03 22:48:41 INFO BlockManagerInfo: Added broadcast_1_piece0 in memory on instance-2026046.ipa.ba..com:41346 (size: 4.0 KB, free: 153.4 GB)
19/03/03 22:48:41 INFO BlockManagerInfo: Added broadcast_0_piece0 in memory on instance-2026046.ipa.ba..com:41346 (size: 34.0 KB, free: 153.4 GB)
19/03/03 22:48:43 INFO TaskSetManager: Finished task 0.0 in stage 0.0 (TID 0) in 2887 ms on instance-2026046.ipa.ba..com (executor 4) (1/1)
19/03/03 22:48:43 INFO YarnScheduler: Removed TaskSet 0.0, whose tasks have all completed, from pool
19/03/03 22:48:43 INFO DAGScheduler: ResultStage 0 (runJob at PythonRDD.scala:141) finished in 3.100 s
19/03/03 22:48:43 INFO DAGScheduler: Job 0 finished: runJob at PythonRDD.scala:141, took 3.163148 s
19/03/03 22:48:43 INFO SparkContext: Starting job: runJob at PythonRDD.scala:141
19/03/03 22:48:43 INFO DAGScheduler: Got job 1 (runJob at PythonRDD.scala:141) with 1 output partitions
19/03/03 22:48:43 INFO DAGScheduler: Final stage: ResultStage 1 (runJob at PythonRDD.scala:141)
19/03/03 22:48:43 INFO DAGScheduler: Parents of final stage: List()
19/03/03 22:48:43 INFO DAGScheduler: Missing parents: List()
19/03/03 22:48:43 INFO DAGScheduler: Submitting ResultStage 1 (PythonRDD[3] at RDD at PythonRDD.scala:48), which has no missing parents
19/03/03 22:48:43 INFO MemoryStore: Block broadcast_2 stored as values in memory (estimated size 6.3 KB, free 153.4 GB)
19/03/03 22:48:43 INFO MemoryStore: Block broadcast_2_piece0 stored as bytes in memory (estimated size 4.0 KB, free 153.4 GB)
19/03/03 22:48:43 INFO BlockManagerInfo: Added broadcast_2_piece0 in memory on instance-2026033.ipa.ba..com:38256 (size: 4.0 KB, free: 153.4 GB)
19/03/03 22:48:43 INFO SparkContext: Created broadcast 2 from broadcast at DAGScheduler.scala:1039
19/03/03 22:48:43 INFO DAGScheduler: Submitting 1 missing tasks from ResultStage 1 (PythonRDD[3] at RDD at PythonRDD.scala:48) (first 15 tasks are for partitions Vector(0))
19/03/03 22:48:43 INFO YarnScheduler: Adding task set 1.0 with 1 tasks
19/03/03 22:48:43 INFO TaskSetManager: Starting task 0.0 in stage 1.0 (TID 1, instance-2026037.ipa.ba..com, executor 5, partition 0, NODE_LOCAL, 8020 bytes)
19/03/03 22:48:44 INFO BlockManagerInfo: Added broadcast_2_piece0 in memory on instance-2026037.ipa.ba..com:45022 (size: 4.0 KB, free: 153.4 GB)
19/03/03 22:48:44 INFO BlockManagerInfo: Added broadcast_0_piece0 in memory on instance-2026037.ipa.ba..com:45022 (size: 34.0 KB, free: 153.4 GB)
19/03/03 22:48:45 INFO TaskSetManager: Finished task 0.0 in stage 1.0 (TID 1) in 2121 ms on instance-2026037.ipa.ba..com (executor 5) (1/1)
19/03/03 22:48:45 INFO YarnScheduler: Removed TaskSet 1.0, whose tasks have all completed, from pool
19/03/03 22:48:45 INFO DAGScheduler: ResultStage 1 (runJob at PythonRDD.scala:141) finished in 2.129 s
19/03/03 22:48:45 INFO DAGScheduler: Job 1 finished: runJob at PythonRDD.scala:141, took 2.132856 s
2019-03-03 22:48:46,218 INFO (MainThread-75262) Reserving TFSparkNodes
2019-03-03 22:48:46,218 INFO (MainThread-75262) cluster_template: {'ps': [0], 'worker': [1, 2, 3, 4, 5]}
2019-03-03 22:48:46,219 INFO (MainThread-75262) listening for reservations at ('10.28.26.33', 42213)
2019-03-03 22:48:46,219 INFO (MainThread-75262) Starting TensorFlow on executors
2019-03-03 22:48:46,224 INFO (MainThread-75262) Waiting for TFSparkNodes to start
2019-03-03 22:48:46,224 INFO (MainThread-75262) waiting for 6 reservations
19/03/03 22:48:46 INFO SparkContext: Starting job: foreachPartition at /usr/lib/python2.7/site-packages/tensorflowonspark/TFCluster.py:301
19/03/03 22:48:46 INFO DAGScheduler: Got job 2 (foreachPartition at /usr/lib/python2.7/site-packages/tensorflowonspark/TFCluster.py:301) with 6 output partitions
19/03/03 22:48:46 INFO DAGScheduler: Final stage: ResultStage 2 (foreachPartition at /usr/lib/python2.7/site-packages/tensorflowonspark/TFCluster.py:301)
19/03/03 22:48:46 INFO DAGScheduler: Parents of final stage: List()
19/03/03 22:48:46 INFO DAGScheduler: Missing parents: List()
19/03/03 22:48:46 INFO DAGScheduler: Submitting ResultStage 2 (PythonRDD[8] at foreachPartition at /usr/lib/python2.7/site-packages/tensorflowonspark/TFCluster.py:301), which has no missing                                                 parents
19/03/03 22:48:46 INFO MemoryStore: Block broadcast_3 stored as values in memory (estimated size 17.4 KB, free 153.4 GB)
19/03/03 22:48:46 INFO MemoryStore: Block broadcast_3_piece0 stored as bytes in memory (estimated size 12.5 KB, free 153.4 GB)
19/03/03 22:48:46 INFO BlockManagerInfo: Added broadcast_3_piece0 in memory on instance-2026033.ipa.ba..com:38256 (size: 12.5 KB, free: 153.4 GB)
19/03/03 22:48:46 INFO SparkContext: Created broadcast 3 from broadcast at DAGScheduler.scala:1039
19/03/03 22:48:46 INFO DAGScheduler: Submitting 6 missing tasks from ResultStage 2 (PythonRDD[8] at foreachPartition at /usr/lib/python2.7/site-packages/tensorflowonspark/TFCluster.py:301)                                                 (first 15 tasks are for partitions Vector(0, 1, 2, 3, 4, 5))
19/03/03 22:48:46 INFO YarnScheduler: Adding task set 2.0 with 6 tasks
19/03/03 22:48:46 INFO TaskSetManager: Starting task 0.0 in stage 2.0 (TID 2, instance-2026039.ipa.ba..com, executor 3, partition 0, PROCESS_LOCAL, 7869 bytes)
19/03/03 22:48:46 INFO TaskSetManager: Starting task 1.0 in stage 2.0 (TID 3, instance-2026046.ipa.ba..com, executor 4, partition 1, PROCESS_LOCAL, 7869 bytes)
19/03/03 22:48:46 INFO TaskSetManager: Starting task 2.0 in stage 2.0 (TID 4, instance-2026038.ipa.ba..com, executor 2, partition 2, PROCESS_LOCAL, 7869 bytes)
19/03/03 22:48:46 INFO TaskSetManager: Starting task 3.0 in stage 2.0 (TID 5, instance-2026037.ipa.ba..com, executor 5, partition 3, PROCESS_LOCAL, 7869 bytes)
19/03/03 22:48:46 INFO TaskSetManager: Starting task 4.0 in stage 2.0 (TID 6, instance-2026043.ipa.ba..com, executor 6, partition 4, PROCESS_LOCAL, 7869 bytes)
19/03/03 22:48:46 INFO TaskSetManager: Starting task 5.0 in stage 2.0 (TID 7, instance-2026045.ipa.ba..com, executor 1, partition 5, PROCESS_LOCAL, 7869 bytes)
19/03/03 22:48:46 INFO BlockManagerInfo: Added broadcast_3_piece0 in memory on instance-2026037.ipa.ba..com:45022 (size: 12.5 KB, free: 153.4 GB)
19/03/03 22:48:46 INFO BlockManagerInfo: Added broadcast_3_piece0 in memory on instance-2026046.ipa.ba..com:41346 (size: 12.5 KB, free: 153.4 GB)
19/03/03 22:48:46 INFO BlockManagerInfo: Added broadcast_3_piece0 in memory on instance-2026039.ipa.ba..com:33711 (size: 12.5 KB, free: 153.4 GB)
19/03/03 22:48:46 INFO BlockManagerInfo: Added broadcast_3_piece0 in memory on instance-2026038.ipa.ba..com:45476 (size: 12.5 KB, free: 153.4 GB)
19/03/03 22:48:46 INFO BlockManagerInfo: Added broadcast_3_piece0 in memory on instance-2026045.ipa.ba..com:45485 (size: 12.5 KB, free: 153.4 GB)
19/03/03 22:48:46 INFO BlockManagerInfo: Added broadcast_3_piece0 in memory on instance-2026043.ipa.ba..com:33242 (size: 12.5 KB, free: 153.4 GB)
2019-03-03 22:48:47,226 INFO (MainThread-75262) waiting for 6 reservations
2019-03-03 22:48:48,226 INFO (MainThread-75262) waiting for 4 reservations
2019-03-03 22:48:49,228 INFO (MainThread-75262) all reservations completed
2019-03-03 22:48:49,228 INFO (MainThread-75262) All TFSparkNodes started
2019-03-03 22:48:49,228 INFO (MainThread-75262) {'executor_id': 3, 'addr': '/tmp/pymp-Nm3GK_/listener-psN9ka', 'task_index': 2, 'job_name': 'worker', 'authkey': '\x19\x83\xec\x1e\xacNM\x16\                                                x89\xc0\xa0,\x9b\x87$\xd3', 'host': '10.28.26.37', 'port': 34616, 'tb_pid': 0, 'tb_port': 0}
2019-03-03 22:48:49,228 INFO (MainThread-75262) {'executor_id': 1, 'addr': '/tmp/pymp-anFXbv/listener-gYfcyl', 'task_index': 0, 'job_name': 'worker', 'authkey': '\xca\xfe\xf8+k\xbfC\n\xbfI\                                                x19\xc7=\xefR\x17', 'host': '10.28.26.46', 'port': 35676, 'tb_pid': 0, 'tb_port': 0}
2019-03-03 22:48:49,228 INFO (MainThread-75262) {'executor_id': 2, 'addr': '/tmp/pymp-Ws00Ww/listener-J06992', 'task_index': 1, 'job_name': 'worker', 'authkey': '?\xc8\xef\xde\x98\xb3EB\x8f                                                O\x80\x89\xeb\xff\x83\x91', 'host': '10.28.26.38', 'port': 34594, 'tb_pid': 0, 'tb_port': 0}
2019-03-03 22:48:49,228 INFO (MainThread-75262) {'executor_id': 0, 'addr': ('10.28.26.39', 45967), 'task_index': 0, 'job_name': 'ps', 'authkey': '\xce\x1b$\xbeg6@T\xb0q\xb8I\x04\xc5\x1a\r'                                                , 'host': '10.28.26.39', 'port': 39537, 'tb_pid': 0, 'tb_port': 0}
2019-03-03 22:48:49,229 INFO (MainThread-75262) {'executor_id': 4, 'addr': '/tmp/pymp-9cxRDb/listener-Vfnobx', 'task_index': 3, 'job_name': 'worker', 'authkey': '\xad\xf6\x10\xb6\x9f\x13O\x                                                c1\xbb\xb0\xd6\x85\xb3e\x16\xee', 'host': '10.28.26.43', 'port': 35727, 'tb_pid': 0, 'tb_port': 0}
2019-03-03 22:48:49,229 INFO (MainThread-75262) {'executor_id': 5, 'addr': '/tmp/pymp-C7cs51/listener-KKXXmn', 'task_index': 4, 'job_name': 'worker', 'authkey': ';\xbe"\xd73\xcdLD\xa559Z\xc                                                b{\x92\x92', 'host': '10.28.26.45', 'port': 46460, 'tb_pid': 0, 'tb_port': 0}
2019-03-03 22:48:49,229 INFO (MainThread-75262) Feeding training data
19/03/03 22:48:49 INFO SparkContext: Starting job: collect at PythonRDD.scala:153
19/03/03 22:48:49 INFO DAGScheduler: Got job 3 (collect at PythonRDD.scala:153) with 600 output partitions
19/03/03 22:48:49 INFO DAGScheduler: Final stage: ResultStage 3 (collect at PythonRDD.scala:153)
19/03/03 22:48:49 INFO DAGScheduler: Parents of final stage: List()
19/03/03 22:48:49 INFO DAGScheduler: Missing parents: List()
19/03/03 22:48:49 INFO DAGScheduler: Submitting ResultStage 3 (PythonRDD[10] at RDD at PythonRDD.scala:48), which has no missing parents
19/03/03 22:48:49 INFO MemoryStore: Block broadcast_4 stored as values in memory (estimated size 389.2 KB, free 153.4 GB)
19/03/03 22:48:49 INFO MemoryStore: Block broadcast_4_piece0 stored as bytes in memory (estimated size 242.0 KB, free 153.4 GB)
19/03/03 22:48:49 INFO BlockManagerInfo: Added broadcast_4_piece0 in memory on instance-2026033.ipa.ba..com:38256 (size: 242.0 KB, free: 153.4 GB)
19/03/03 22:48:49 INFO SparkContext: Created broadcast 4 from broadcast at DAGScheduler.scala:1039
19/03/03 22:48:49 INFO DAGScheduler: Submitting 600 missing tasks from ResultStage 3 (PythonRDD[10] at RDD at PythonRDD.scala:48) (first 15 tasks are for partitions Vector(0, 1, 2, 3, 4, 5,                                                 6, 7, 8, 9, 10, 11, 12, 13, 14))
19/03/03 22:48:49 INFO YarnScheduler: Adding task set 3.0 with 600 tasks
19/03/03 22:48:49 INFO TaskSetManager: Finished task 5.0 in stage 2.0 (TID 7) in 3490 ms on instance-2026045.ipa.ba..com (executor 1) (1/6)
19/03/03 22:48:49 INFO TaskSetManager: Starting task 0.0 in stage 3.0 (TID 8, instance-2026037.ipa.ba..com, executor 5, partition 0, NODE_LOCAL, 8440 bytes)
19/03/03 22:48:49 INFO TaskSetManager: Finished task 3.0 in stage 2.0 (TID 5) in 3569 ms on instance-2026037.ipa.ba..com (executor 5) (2/6)
19/03/03 22:48:49 INFO BlockManagerInfo: Added broadcast_4_piece0 in memory on instance-2026037.ipa.ba..com:45022 (size: 242.0 KB, free: 153.4 GB)
19/03/03 22:48:50 INFO TaskSetManager: Finished task 1.0 in stage 2.0 (TID 3) in 3867 ms on instance-2026046.ipa.ba..com (executor 4) (3/6)
19/03/03 22:48:50 INFO TaskSetManager: Starting task 3.0 in stage 3.0 (TID 9, instance-2026038.ipa.ba..com, executor 2, partition 3, NODE_LOCAL, 8440 bytes)
19/03/03 22:48:50 INFO TaskSetManager: Finished task 2.0 in stage 2.0 (TID 4) in 4240 ms on instance-2026038.ipa.ba..com (executor 2) (4/6)
19/03/03 22:48:50 INFO BlockManagerInfo: Added broadcast_4_piece0 in memory on instance-2026038.ipa.ba..com:45476 (size: 242.0 KB, free: 153.4 GB)
19/03/03 22:48:50 INFO BlockManagerInfo: Added broadcast_0_piece0 in memory on instance-2026038.ipa.ba..com:45476 (size: 34.0 KB, free: 153.4 GB)
19/03/03 22:48:50 INFO TaskSetManager: Starting task 1.0 in stage 3.0 (TID 10, instance-2026043.ipa.ba..com, executor 6, partition 1, NODE_LOCAL, 8440 bytes)
19/03/03 22:48:50 INFO TaskSetManager: Finished task 4.0 in stage 2.0 (TID 6) in 4423 ms on instance-2026043.ipa.ba..com (executor 6) (5/6)
19/03/03 22:48:50 INFO BlockManagerInfo: Added broadcast_4_piece0 in memory on instance-2026043.ipa.ba..com:33242 (size: 242.0 KB, free: 153.4 GB)
19/03/03 22:48:50 INFO BlockManagerInfo: Added broadcast_0_piece0 in memory on instance-2026043.ipa.ba..com:33242 (size: 34.0 KB, free: 153.4 GB)
19/03/03 22:48:54 INFO TaskSetManager: Starting task 2.0 in stage 3.0 (TID 11, instance-2026046.ipa.ba..com, executor 4, partition 2, RACK_LOCAL, 8440 bytes)
19/03/03 22:48:54 INFO TaskSetManager: Starting task 4.0 in stage 3.0 (TID 12, instance-2026045.ipa.ba..com, executor 1, partition 4, RACK_LOCAL, 8440 bytes)
19/03/03 22:48:54 INFO BlockManagerInfo: Added broadcast_4_piece0 in memory on instance-2026045.ipa.ba..com:45485 (size: 242.0 KB, free: 153.4 GB)
19/03/03 22:48:54 INFO BlockManagerInfo: Added broadcast_4_piece0 in memory on instance-2026046.ipa.ba..com:41346 (size: 242.0 KB, free: 153.4 GB)
19/03/03 22:48:54 INFO BlockManagerInfo: Added broadcast_0_piece0 in memory on instance-2026045.ipa.ba..com:45485 (size: 34.0 KB, free: 153.4 GB)
19/03/03 22:49:35 INFO TaskSetManager: Starting task 5.0 in stage 3.0 (TID 13, instance-2026038.ipa.ba..com, executor 2, partition 5, NODE_LOCAL, 8440 bytes)
19/03/03 22:49:35 INFO TaskSetManager: Finished task 3.0 in stage 3.0 (TID 9) in 45462 ms on instance-2026038.ipa.ba..com (executor 2) (1/600)
19/03/03 22:49:36 INFO TaskSetManager: Starting task 6.0 in stage 3.0 (TID 14, instance-2026037.ipa.ba..com, executor 5, partition 6, NODE_LOCAL, 8440 bytes)
19/03/03 22:49:36 INFO TaskSetManager: Finished task 0.0 in stage 3.0 (TID 8) in 46239 ms on instance-2026037.ipa.ba..com (executor 5) (2/600)
19/03/03 22:49:36 INFO TaskSetManager: Starting task 7.0 in stage 3.0 (TID 15, instance-2026043.ipa.ba..com, executor 6, partition 7, NODE_LOCAL, 8440 bytes)
19/03/03 22:49:36 INFO TaskSetManager: Finished task 1.0 in stage 3.0 (TID 10) in 45790 ms on instance-2026043.ipa.ba..com (executor 6) (3/600)
19/03/03 22:49:39 INFO TaskSetManager: Finished task 2.0 in stage 3.0 (TID 11) in 44513 ms on instance-2026046.ipa.ba..com (executor 4) (4/600)
19/03/03 22:49:39 INFO TaskSetManager: Starting task 8.0 in stage 3.0 (TID 16, instance-2026046.ipa.ba..com, executor 4, partition 8, RACK_LOCAL, 8440 bytes)
19/03/03 22:49:40 INFO TaskSetManager: Starting task 9.0 in stage 3.0 (TID 17, instance-2026045.ipa.ba..com, executor 1, partition 9, RACK_LOCAL, 8440 bytes)
19/03/03 22:49:40 INFO TaskSetManager: Finished task 4.0 in stage 3.0 (TID 12) in 45791 ms on instance-2026045.ipa.ba..com (executor 1) (5/600)
19/03/03 22:50:15 INFO TaskSetManager: Starting task 21.0 in stage 3.0 (TID 18, instance-2026038.ipa.ba..com, executor 2, partition 21, NODE_LOCAL, 8440 bytes)
19/03/03 22:50:15 INFO TaskSetManager: Finished task 5.0 in stage 3.0 (TID 13) in 39597 ms on instance-2026038.ipa.ba..com (executor 2) (6/600)
19/03/03 22:50:16 INFO TaskSetManager: Starting task 11.0 in stage 3.0 (TID 19, instance-2026043.ipa.ba..com, executor 6, partition 11, NODE_LOCAL, 8440 bytes)
19/03/03 22:50:16 INFO TaskSetManager: Finished task 7.0 in stage 3.0 (TID 15) in 39619 ms on instance-2026043.ipa.ba..com (executor 6) (7/600)
19/03/03 22:50:17 INFO TaskSetManager: Starting task 10.0 in stage 3.0 (TID 20, instance-2026037.ipa.ba..com, executor 5, partition 10, NODE_LOCAL, 8440 bytes)
19/03/03 22:50:17 INFO TaskSetManager: Finished task 6.0 in stage 3.0 (TID 14) in 41346 ms on instance-2026037.ipa.ba..com (executor 5) (8/600)
19/03/03 22:50:19 INFO TaskSetManager: Finished task 8.0 in stage 3.0 (TID 16) in 39872 ms on instance-2026046.ipa.ba..com (executor 4) (9/600)
19/03/03 22:50:20 INFO TaskSetManager: Starting task 12.0 in stage 3.0 (TID 21, instance-2026046.ipa.ba..com, executor 4, partition 12, RACK_LOCAL, 8440 bytes)
19/03/03 22:50:24 INFO TaskSetManager: Starting task 13.0 in stage 3.0 (TID 22, instance-2026045.ipa.ba..com, executor 1, partition 13, RACK_LOCAL, 8440 bytes)
19/03/03 22:50:24 INFO TaskSetManager: Finished task 9.0 in stage 3.0 (TID 17) in 43885 ms on instance-2026045.ipa.ba..com (executor 1) (10/600)
19/03/03 22:50:54 INFO TaskSetManager: Starting task 26.0 in stage 3.0 (TID 23, instance-2026038.ipa.ba..com, executor 2, partition 26, NODE_LOCAL, 8440 bytes)
19/03/03 22:50:54 INFO TaskSetManager: Finished task 21.0 in stage 3.0 (TID 18) in 39348 ms on instance-2026038.ipa.ba..com (executor 2) (11/600)
19/03/03 22:50:55 INFO TaskSetManager: Starting task 14.0 in stage 3.0 (TID 24, instance-2026043.ipa.ba..com, executor 6, partition 14, NODE_LOCAL, 8440 bytes)
19/03/03 22:50:55 INFO TaskSetManager: Finished task 11.0 in stage 3.0 (TID 19) in 39112 ms on instance-2026043.ipa.ba..com (executor 6) (12/600)
19/03/03 22:50:58 INFO TaskSetManager: Starting task 24.0 in stage 3.0 (TID 25, instance-2026037.ipa.ba..com, executor 5, partition 24, NODE_LOCAL, 8440 bytes)
19/03/03 22:50:58 INFO TaskSetManager: Finished task 10.0 in stage 3.0 (TID 20) in 41423 ms on instance-2026037.ipa.ba..com (executor 5) (13/600)
19/03/03 22:51:00 INFO TaskSetManager: Finished task 12.0 in stage 3.0 (TID 21) in 39971 ms on instance-2026046.ipa.ba..com (executor 4) (14/600)
19/03/03 22:51:02 INFO TaskSetManager: Starting task 15.0 in stage 3.0 (TID 26, instance-2026046.ipa.ba..com, executor 4, partition 15, RACK_LOCAL, 8440 bytes)
19/03/03 22:51:03 INFO TaskSetManager: Starting task 16.0 in stage 3.0 (TID 27, instance-2026045.ipa.ba..com, executor 1, partition 16, RACK_LOCAL, 8440 bytes)
19/03/03 22:51:03 INFO TaskSetManager: Finished task 13.0 in stage 3.0 (TID 22) in 39738 ms on instance-2026045.ipa.ba..com (executor 1) (15/600)
19/03/03 22:51:34 INFO TaskSetManager: Starting task 32.0 in stage 3.0 (TID 28, instance-2026043.ipa.ba..com, executor 6, partition 32, NODE_LOCAL, 8440 bytes)
19/03/03 22:51:34 INFO TaskSetManager: Finished task 14.0 in stage 3.0 (TID 24) in 39273 ms on instance-2026043.ipa.ba..com (executor 6) (16/600)
19/03/03 22:51:34 INFO TaskSetManager: Starting task 31.0 in stage 3.0 (TID 29, instance-2026038.ipa.ba..com, executor 2, partition 31, NODE_LOCAL, 8440 bytes)
19/03/03 22:51:34 INFO TaskSetManager: Finished task 26.0 in stage 3.0 (TID 23) in 39756 ms on instance-2026038.ipa.ba..com (executor 2) (17/600)
19/03/03 22:51:40 INFO TaskSetManager: Starting task 25.0 in stage 3.0 (TID 30, instance-2026037.ipa.ba..com, executor 5, partition 25, NODE_LOCAL, 8440 bytes)
19/03/03 22:51:40 INFO TaskSetManager: Finished task 24.0 in stage 3.0 (TID 25) in 41287 ms on instance-2026037.ipa.ba..com (executor 5) (18/600)
19/03/03 22:51:42 INFO TaskSetManager: Finished task 15.0 in stage 3.0 (TID 26) in 39507 ms on instance-2026046.ipa.ba..com (executor 4) (19/600)
19/03/03 22:51:43 INFO TaskSetManager: Starting task 17.0 in stage 3.0 (TID 31, instance-2026045.ipa.ba..com, executor 1, partition 17, RACK_LOCAL, 8440 bytes)
19/03/03 22:51:43 INFO TaskSetManager: Finished task 16.0 in stage 3.0 (TID 27) in 39474 ms on instance-2026045.ipa.ba..com (executor 1) (20/600)
19/03/04 00:07:54 INFO TaskSetManager: Starting task 541.0 in stage 3.0 (TID 583, instance-2026038.ipa.ba..com, executor 2, partition 541, RACK_LOCAL, 8440 bytes)
19/03/04 00:07:54 INFO TaskSetManager: Finished task 533.0 in stage 3.0 (TID 579) in 40271 ms on instance-2026038.ipa.ba..com (executor 2) (571/600)
19/03/04 00:07:57 INFO TaskSetManager: Starting task 543.0 in stage 3.0 (TID 584, instance-2026037.ipa.ba..com, executor 5, partition 543, RACK_LOCAL, 8440 bytes)
19/03/04 00:07:57 INFO TaskSetManager: Finished task 532.0 in stage 3.0 (TID 578) in 44489 ms on instance-2026037.ipa.ba..com (executor 5) (572/600)
19/03/04 00:08:12 INFO TaskSetManager: Starting task 544.0 in stage 3.0 (TID 585, instance-2026043.ipa.ba..com, executor 6, partition 544, RACK_LOCAL, 8440 bytes)
19/03/04 00:08:12 INFO TaskSetManager: Finished task 534.0 in stage 3.0 (TID 580) in 40645 ms on instance-2026043.ipa.ba..com (executor 6) (573/600)
19/03/04 00:08:17 INFO TaskSetManager: Starting task 545.0 in stage 3.0 (TID 586, instance-2026045.ipa.ba..com, executor 1, partition 545, RACK_LOCAL, 8440 bytes)
19/03/04 00:08:17 INFO TaskSetManager: Finished task 535.0 in stage 3.0 (TID 581) in 40536 ms on instance-2026045.ipa.ba..com (executor 1) (574/600)
19/03/04 00:08:31 INFO TaskSetManager: Starting task 548.0 in stage 3.0 (TID 587, instance-2026046.ipa.ba..com, executor 4, partition 548, RACK_LOCAL, 8440 bytes)
19/03/04 00:08:31 INFO TaskSetManager: Finished task 540.0 in stage 3.0 (TID 582) in 38886 ms on instance-2026046.ipa.ba..com (executor 4) (575/600)
19/03/04 00:08:36 INFO TaskSetManager: Starting task 551.0 in stage 3.0 (TID 588, instance-2026038.ipa.ba..com, executor 2, partition 551, RACK_LOCAL, 8440 bytes)
19/03/04 00:08:36 INFO TaskSetManager: Finished task 541.0 in stage 3.0 (TID 583) in 41700 ms on instance-2026038.ipa.ba..com (executor 2) (576/600)
19/03/04 00:08:41 INFO TaskSetManager: Starting task 554.0 in stage 3.0 (TID 589, instance-2026037.ipa.ba..com, executor 5, partition 554, RACK_LOCAL, 8440 bytes)
19/03/04 00:08:41 INFO TaskSetManager: Finished task 543.0 in stage 3.0 (TID 584) in 44824 ms on instance-2026037.ipa.ba..com (executor 5) (577/600)
19/03/04 00:08:54 INFO TaskSetManager: Starting task 555.0 in stage 3.0 (TID 590, instance-2026043.ipa.ba..com, executor 6, partition 555, RACK_LOCAL, 8440 bytes)
19/03/04 00:08:54 INFO TaskSetManager: Finished task 544.0 in stage 3.0 (TID 585) in 41618 ms on instance-2026043.ipa.ba..com (executor 6) (578/600)
19/03/04 00:08:58 INFO TaskSetManager: Starting task 558.0 in stage 3.0 (TID 591, instance-2026045.ipa.ba..com, executor 1, partition 558, RACK_LOCAL, 8440 bytes)
19/03/04 00:08:58 INFO TaskSetManager: Finished task 545.0 in stage 3.0 (TID 586) in 40567 ms on instance-2026045.ipa.ba..com (executor 1) (579/600)
19/03/04 00:09:10 INFO TaskSetManager: Starting task 559.0 in stage 3.0 (TID 592, instance-2026046.ipa.ba..com, executor 4, partition 559, RACK_LOCAL, 8440 bytes)
19/03/04 00:09:10 INFO TaskSetManager: Finished task 548.0 in stage 3.0 (TID 587) in 39338 ms on instance-2026046.ipa.ba..com (executor 4) (580/600)
19/03/04 00:09:17 INFO TaskSetManager: Starting task 562.0 in stage 3.0 (TID 593, instance-2026038.ipa.ba..com, executor 2, partition 562, RACK_LOCAL, 8440 bytes)
19/03/04 00:09:17 INFO TaskSetManager: Finished task 551.0 in stage 3.0 (TID 588) in 41027 ms on instance-2026038.ipa.ba..com (executor 2) (581/600)
19/03/04 00:09:27 INFO TaskSetManager: Starting task 563.0 in stage 3.0 (TID 594, instance-2026037.ipa.ba..com, executor 5, partition 563, RACK_LOCAL, 8440 bytes)
19/03/04 00:09:27 INFO TaskSetManager: Finished task 554.0 in stage 3.0 (TID 589) in 45716 ms on instance-2026037.ipa.ba..com (executor 5) (582/600)
19/03/04 00:09:35 INFO TaskSetManager: Starting task 565.0 in stage 3.0 (TID 595, instance-2026043.ipa.ba..com, executor 6, partition 565, RACK_LOCAL, 8440 bytes)
19/03/04 00:09:35 INFO TaskSetManager: Finished task 555.0 in stage 3.0 (TID 590) in 41138 ms on instance-2026043.ipa.ba..com (executor 6) (583/600)
19/03/04 00:09:40 INFO TaskSetManager: Starting task 568.0 in stage 3.0 (TID 596, instance-2026045.ipa.ba..com, executor 1, partition 568, RACK_LOCAL, 8440 bytes)
19/03/04 00:09:40 INFO TaskSetManager: Finished task 558.0 in stage 3.0 (TID 591) in 42608 ms on instance-2026045.ipa.ba..com (executor 1) (584/600)
19/03/04 00:09:50 INFO TaskSetManager: Starting task 574.0 in stage 3.0 (TID 597, instance-2026046.ipa.ba..com, executor 4, partition 574, RACK_LOCAL, 8440 bytes)
19/03/04 00:09:50 INFO TaskSetManager: Finished task 559.0 in stage 3.0 (TID 592) in 39112 ms on instance-2026046.ipa.ba..com (executor 4) (585/600)
19/03/04 00:09:58 INFO TaskSetManager: Starting task 578.0 in stage 3.0 (TID 598, instance-2026038.ipa.ba..com, executor 2, partition 578, RACK_LOCAL, 8440 bytes)
19/03/04 00:09:58 INFO TaskSetManager: Finished task 562.0 in stage 3.0 (TID 593) in 41681 ms on instance-2026038.ipa.ba..com (executor 2) (586/600)
19/03/04 00:10:08 INFO TaskSetManager: Starting task 579.0 in stage 3.0 (TID 599, instance-2026037.ipa.ba..com, executor 5, partition 579, RACK_LOCAL, 8440 bytes)
19/03/04 00:10:08 INFO TaskSetManager: Finished task 563.0 in stage 3.0 (TID 594) in 41401 ms on instance-2026037.ipa.ba..com (executor 5) (587/600)
19/03/04 00:10:16 INFO TaskSetManager: Starting task 582.0 in stage 3.0 (TID 600, instance-2026043.ipa.ba..com, executor 6, partition 582, RACK_LOCAL, 8440 bytes)
19/03/04 00:10:16 INFO TaskSetManager: Finished task 565.0 in stage 3.0 (TID 595) in 41009 ms on instance-2026043.ipa.ba..com (executor 6) (588/600)
19/03/04 00:10:22 INFO TaskSetManager: Starting task 583.0 in stage 3.0 (TID 601, instance-2026045.ipa.ba..com, executor 1, partition 583, RACK_LOCAL, 8440 bytes)
19/03/04 00:10:22 INFO TaskSetManager: Finished task 568.0 in stage 3.0 (TID 596) in 41692 ms on instance-2026045.ipa.ba..com (executor 1) (589/600)
19/03/04 00:10:29 INFO TaskSetManager: Starting task 584.0 in stage 3.0 (TID 602, instance-2026046.ipa.ba..com, executor 4, partition 584, RACK_LOCAL, 8440 bytes)
19/03/04 00:10:29 INFO TaskSetManager: Finished task 574.0 in stage 3.0 (TID 597) in 39804 ms on instance-2026046.ipa.ba..com (executor 4) (590/600)
19/03/04 00:10:41 INFO TaskSetManager: Starting task 588.0 in stage 3.0 (TID 603, instance-2026038.ipa.ba..com, executor 2, partition 588, RACK_LOCAL, 8440 bytes)
19/03/04 00:10:41 INFO TaskSetManager: Finished task 578.0 in stage 3.0 (TID 598) in 42417 ms on instance-2026038.ipa.ba..com (executor 2) (591/600)
19/03/04 00:10:50 INFO TaskSetManager: Starting task 590.0 in stage 3.0 (TID 604, instance-2026037.ipa.ba..com, executor 5, partition 590, RACK_LOCAL, 8440 bytes)
19/03/04 00:10:50 INFO TaskSetManager: Finished task 579.0 in stage 3.0 (TID 599) in 41431 ms on instance-2026037.ipa.ba..com (executor 5) (592/600)
19/03/04 00:10:59 INFO TaskSetManager: Starting task 591.0 in stage 3.0 (TID 605, instance-2026043.ipa.ba..com, executor 6, partition 591, RACK_LOCAL, 8440 bytes)
19/03/04 00:10:59 INFO TaskSetManager: Finished task 582.0 in stage 3.0 (TID 600) in 42926 ms on instance-2026043.ipa.ba..com (executor 6) (593/600)
19/03/04 00:11:04 INFO TaskSetManager: Starting task 595.0 in stage 3.0 (TID 606, instance-2026045.ipa.ba..com, executor 1, partition 595, RACK_LOCAL, 8440 bytes)
19/03/04 00:11:04 INFO TaskSetManager: Finished task 583.0 in stage 3.0 (TID 601) in 42264 ms on instance-2026045.ipa.ba..com (executor 1) (594/600)
19/03/04 00:11:10 INFO TaskSetManager: Starting task 597.0 in stage 3.0 (TID 607, instance-2026046.ipa.ba..com, executor 4, partition 597, RACK_LOCAL, 8440 bytes)
19/03/04 00:11:10 INFO TaskSetManager: Finished task 584.0 in stage 3.0 (TID 602) in 40587 ms on instance-2026046.ipa.ba..com (executor 4) (595/600)
19/03/04 00:11:25 INFO TaskSetManager: Finished task 588.0 in stage 3.0 (TID 603) in 44005 ms on instance-2026038.ipa.ba..com (executor 2) (596/600)
19/03/04 00:11:32 INFO TaskSetManager: Finished task 590.0 in stage 3.0 (TID 604) in 42027 ms on instance-2026037.ipa.ba..com (executor 5) (597/600)
19/03/04 00:11:42 INFO TaskSetManager: Finished task 591.0 in stage 3.0 (TID 605) in 43176 ms on instance-2026043.ipa.ba..com (executor 6) (598/600)
19/03/04 00:11:47 INFO TaskSetManager: Finished task 595.0 in stage 3.0 (TID 606) in 43036 ms on instance-2026045.ipa.ba..com (executor 1) (599/600)
19/03/04 00:11:51 INFO TaskSetManager: Finished task 597.0 in stage 3.0 (TID 607) in 41030 ms on instance-2026046.ipa.ba..com (executor 4) (600/600)
19/03/04 00:11:51 INFO YarnScheduler: Removed TaskSet 3.0, whose tasks have all completed, from pool
19/03/04 00:11:51 INFO DAGScheduler: ResultStage 3 (collect at PythonRDD.scala:153) finished in 4982.168 s
19/03/04 00:11:51 INFO DAGScheduler: Job 3 finished: collect at PythonRDD.scala:153, took 4982.215393 s
2019-03-04 00:11:51,504 INFO (MainThread-75262) Stopping TensorFlow nodes
19/03/04 00:11:51 INFO SparkContext: Starting job: collect at PythonRDD.scala:153
19/03/04 00:11:51 INFO DAGScheduler: Got job 4 (collect at PythonRDD.scala:153) with 5 output partitions
19/03/04 00:11:51 INFO DAGScheduler: Final stage: ResultStage 4 (collect at PythonRDD.scala:153)
19/03/04 00:11:51 INFO DAGScheduler: Parents of final stage: List()
19/03/04 00:11:51 INFO DAGScheduler: Missing parents: List()
19/03/04 00:11:51 INFO DAGScheduler: Submitting ResultStage 4 (PythonRDD[12] at RDD at PythonRDD.scala:48), which has no missing parents
19/03/04 00:11:51 INFO MemoryStore: Block broadcast_5 stored as values in memory (estimated size 7.4 KB, free 153.4 GB)
19/03/04 00:11:51 INFO MemoryStore: Block broadcast_5_piece0 stored as bytes in memory (estimated size 4.9 KB, free 153.4 GB)
19/03/04 00:11:51 INFO BlockManagerInfo: Added broadcast_5_piece0 in memory on instance-2026033.ipa.ba..com:38256 (size: 4.9 KB, free: 153.4 GB)
19/03/04 00:11:51 INFO SparkContext: Created broadcast 5 from broadcast at DAGScheduler.scala:1039
19/03/04 00:11:51 INFO DAGScheduler: Submitting 5 missing tasks from ResultStage 4 (PythonRDD[12] at RDD at PythonRDD.scala:48) (first 15 tasks are for partitions Vector(0, 1, 2, 3, 4))
19/03/04 00:11:51 INFO YarnScheduler: Adding task set 4.0 with 5 tasks
19/03/04 00:11:51 INFO TaskSetManager: Starting task 0.0 in stage 4.0 (TID 608, instance-2026045.ipa.ba..com, executor 1, partition 0, PROCESS_LOCAL, 7869 bytes)
19/03/04 00:11:51 INFO TaskSetManager: Starting task 1.0 in stage 4.0 (TID 609, instance-2026037.ipa.ba..com, executor 5, partition 1, PROCESS_LOCAL, 7869 bytes)
19/03/04 00:11:51 INFO TaskSetManager: Starting task 2.0 in stage 4.0 (TID 610, instance-2026043.ipa.ba..com, executor 6, partition 2, PROCESS_LOCAL, 7869 bytes)
19/03/04 00:11:51 INFO TaskSetManager: Starting task 3.0 in stage 4.0 (TID 611, instance-2026046.ipa.ba..com, executor 4, partition 3, PROCESS_LOCAL, 7869 bytes)
19/03/04 00:11:51 INFO TaskSetManager: Starting task 4.0 in stage 4.0 (TID 612, instance-2026038.ipa.ba..com, executor 2, partition 4, PROCESS_LOCAL, 7869 bytes)
19/03/04 00:11:51 INFO BlockManagerInfo: Added broadcast_5_piece0 in memory on instance-2026038.ipa.ba..com:45476 (size: 4.9 KB, free: 153.4 GB)
19/03/04 00:11:51 INFO BlockManagerInfo: Added broadcast_5_piece0 in memory on instance-2026046.ipa.ba..com:41346 (size: 4.9 KB, free: 153.4 GB)
19/03/04 00:11:51 INFO BlockManagerInfo: Added broadcast_5_piece0 in memory on instance-2026043.ipa.ba..com:33242 (size: 4.9 KB, free: 153.4 GB)
19/03/04 00:11:51 INFO BlockManagerInfo: Added broadcast_5_piece0 in memory on instance-2026037.ipa.ba..com:45022 (size: 4.9 KB, free: 153.4 GB)
19/03/04 00:11:51 INFO BlockManagerInfo: Added broadcast_5_piece0 in memory on instance-2026045.ipa.ba..com:45485 (size: 4.9 KB, free: 153.4 GB)
19/03/04 00:11:51 INFO TaskSetManager: Finished task 4.0 in stage 4.0 (TID 612) in 37 ms on instance-2026038.ipa.ba..com (executor 2) (1/5)
19/03/04 00:11:51 INFO TaskSetManager: Finished task 3.0 in stage 4.0 (TID 611) in 40 ms on instance-2026046.ipa.ba..com (executor 4) (2/5)
19/03/04 00:11:51 INFO TaskSetManager: Finished task 2.0 in stage 4.0 (TID 610) in 41 ms on instance-2026043.ipa.ba..com (executor 6) (3/5)
19/03/04 00:11:51 INFO TaskSetManager: Finished task 1.0 in stage 4.0 (TID 609) in 42 ms on instance-2026037.ipa.ba..com (executor 5) (4/5)
19/03/04 00:11:51 INFO TaskSetManager: Finished task 0.0 in stage 4.0 (TID 608) in 44 ms on instance-2026045.ipa.ba..com (executor 1) (5/5)
19/03/04 00:11:51 INFO YarnScheduler: Removed TaskSet 4.0, whose tasks have all completed, from pool
19/03/04 00:11:51 INFO DAGScheduler: ResultStage 4 (collect at PythonRDD.scala:153) finished in 0.052 s
19/03/04 00:11:51 INFO DAGScheduler: Job 4 finished: collect at PythonRDD.scala:153, took 0.054342 s
2019-03-04 00:11:51,584 INFO (MainThread-75262) Shutting down cluster
19/03/04 00:11:54 INFO TaskSetManager: Finished task 0.0 in stage 2.0 (TID 2) in 4988571 ms on instance-2026039.ipa.ba..com (executor 3) (6/6)
19/03/04 00:11:54 INFO YarnScheduler: Removed TaskSet 2.0, whose tasks have all completed, from pool
19/03/04 00:11:54 INFO DAGScheduler: ResultStage 2 (foreachPartition at /usr/lib/python2.7/site-packages/tensorflowonspark/TFCluster.py:301) finished in 4988.578 s
19/03/04 00:11:54 INFO DAGScheduler: Job 2 finished: foreachPartition at /usr/lib/python2.7/site-packages/tensorflowonspark/TFCluster.py:301, took 4988.581628 s
19/03/04 00:11:57 INFO SparkContext: Invoking stop() from shutdown hook
19/03/04 00:11:57 INFO SparkUI: Stopped Spark web UI at http://instance-2026033.ipa.ba..com:4041
19/03/04 00:11:57 INFO YarnClientSchedulerBackend: Interrupting monitor thread
19/03/04 00:11:58 INFO YarnClientSchedulerBackend: Shutting down all executors
19/03/04 00:11:58 INFO YarnSchedulerBackend$YarnDriverEndpoint: Asking each executor to shut down
19/03/04 00:11:58 INFO SchedulerExtensionServices: Stopping SchedulerExtensionServices
(serviceOption=None,
 services=List(),
 started=false)
19/03/04 00:11:58 INFO YarnClientSchedulerBackend: Stopped
19/03/04 00:11:58 INFO MapOutputTrackerMasterEndpoint: MapOutputTrackerMasterEndpoint stopped!
19/03/04 00:11:58 INFO MemoryStore: MemoryStore cleared
19/03/04 00:11:58 INFO BlockManager: BlockManager stopped
19/03/04 00:11:58 INFO BlockManagerMaster: BlockManagerMaster stopped
19/03/04 00:11:58 INFO OutputCommitCoordinator$OutputCommitCoordinatorEndpoint: OutputCommitCoordinator stopped!
19/03/04 00:11:58 INFO SparkContext: Successfully stopped SparkContext
19/03/04 00:11:58 INFO ShutdownHookManager: Shutdown hook called
19/03/04 00:11:58 INFO ShutdownHookManager: Deleting directory /tmp/spark-2c81098d-4f91-4a1d-87ea-71ecf1c72204
19/03/04 00:11:58 INFO ShutdownHookManager: Deleting directory /tmp/spark-2c81098d-4f91-4a1d-87ea-71ecf1c72204/pyspark-6a95ff4e-93cf-4c31-ac1a-002432e73cd1
19/03/04 00:11:58 INFO ShutdownHookManager: Deleting directory /tmp/spark-3d2e5cc9-f5d1-45a4-a387-50197a773614
[Surya@..com@instance-2026033 ~]$

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions