Closed
Description
I am using a keras example with TensorflowOnSpark. I am trying to save the model file to the hdfs location I specify as args.
With no exception or errors in the Yarn Log the process have been completed, BUT I DONT SEE ANY MODEL FILE SAVED IN THE HDFS.
below is the code and the log. Please check.
Also I am not inputting any validation data in the fit_generator method. is it mandatory?
from __future__ import print_function
def main_fun(args, ctx):
import numpy
import os
import tensorflow as tf
from tensorflow.python import keras
from tensorflow.python.keras import backend as K
from tensorflow.python.keras.datasets import mnist
from tensorflow.python.keras.models import Sequential, load_model, save_model
from tensorflow.python.keras.layers import Dense, Dropout
from tensorflow.python.keras.optimizers import RMSprop
from tensorflow.python.keras.callbacks import LambdaCallback, TensorBoard
from tensorflow.python.saved_model import builder as saved_model_builder
from tensorflow.python.saved_model import tag_constants
from tensorflow.python.saved_model.signature_def_utils_impl import predict_signature_def
from tensorflowonspark import TFNode
cluster, server = TFNode.start_cluster_server(ctx)
if ctx.job_name == "ps":
server.join()
elif ctx.job_name == "worker":
def generate_rdd_data(tf_feed, batch_size):
print("generate_rdd_data invoked")
while True:
batch = tf_feed.next_batch(batch_size)
feature_vector = []
lbls = []
for item in batch:
feature_vector.append(item[0])
lbls.append(item[1])
features = numpy.array(feature_vector).astype('float32')
labels = numpy.stack(lbls).astype('float32')
yield (features, labels)
with tf.device(tf.train.replica_device_setter(
worker_device="/job:worker/task:%d" % ctx.task_index,
cluster=cluster)):
batch_size = 100
num_classes = 14
# args.mode == 'spark':
x_train = tf.placeholder(tf.float32, [None, 28047], name="x_train")
y_train = tf.placeholder(tf.float32, [None, 14], name="y_train")
model = Sequential()
model.add(Dense(512, activation='relu', input_shape=(28047,)))
model.add(Dropout(0.2))
model.add(Dense(512, activation='relu'))
model.add(Dropout(0.2))
model.add(Dense(14, activation='softmax'))
model.summary()
model.compile(loss='categorical_crossentropy',
optimizer=tf.train.RMSPropOptimizer(learning_rate=0.001),
metrics=['accuracy'])
saver = tf.train.Saver()
with tf.Session(server.target) as sess:
K.set_session(sess)
def save_checkpoint(epoch, logs=None):
if epoch == 1:
tf.train.write_graph(sess.graph.as_graph_def(), args.model_dir, 'graph.pbtxt')
saver.save(sess, os.path.join(args.model_dir, 'model.ckpt'), global_step=epoch * args.steps_per_epoch)
#ckpt_callback = LambdaCallback(on_epoch_end=save_checkpoint)
#tb_callback = TensorBoard(log_dir=args.model_dir, histogram_freq=1, write_graph=True, write_images=True)
# Add callbacks to save model checkpoint and tensorboard events (on worker:0 only)
#callbacks = [ckpt_callback, tb_callback] if ctx.task_index == 0 else None
# args.input_mode == 'spark':
# train on data read from a generator which is producing data from a Spark RDD
tf_feed = TFNode.DataFeed(ctx.mgr)
model.fit_generator(generator=generate_rdd_data(tf_feed, batch_size),
steps_per_epoch=args.steps_per_epoch,
epochs=args.epochs,
verbose=1,
callbacks=None)
if args.export_dir and ctx.job_name == 'worker' and ctx.task_index == 0:
# save a local Keras model, so we can reload it with an inferencing learning_phase
save_model(model, "tmp_model")
# reload the model
K.set_learning_phase(False)
new_model = load_model("tmp_model")
# export a saved_model for inferencing
builder = saved_model_builder.SavedModelBuilder(args.export_dir)
signature = predict_signature_def(inputs={'fetures': new_model.input},
outputs={'scores': new_model.output})
builder.add_meta_graph_and_variables(sess=sess,
tags=[tag_constants.SERVING],
signature_def_map={'predict': signature},
clear_devices=True)
builder.save()
if args.input_mode == 'spark':
tf_feed.terminate()
if __name__ == '__main__':
import argparse
from pyspark.context import SparkContext
from pyspark.conf import SparkConf
from tensorflowonspark import TFCluster
import keras
sc = SparkContext(conf=SparkConf().setAppName("PhaseOneModelling"))
executors = sc._conf.get("spark.executor.instances")
num_executors = int(executors) if executors is not None else 1
num_ps = 1
parser = argparse.ArgumentParser()
parser.add_argument("--cluster_size", help="number of nodes in the cluster", type=int, default=num_executors)
parser.add_argument("--epochs", help="number of epochs of training data", type=int, default=20)
parser.add_argument("--export_dir", help="directory to export saved_model")
parser.add_argument("--data", help="HDFS path to data in parallelized CSV format")
# parser.add_argument("--input_mode", help="input mode (tf|spark)", default="tf")
parser.add_argument("--labels", help="HDFS path to MNIST labels in parallelized CSV format")
parser.add_argument("--model_dir", help="directory to write model checkpoints")
parser.add_argument("--num_ps", help="number of ps nodes", type=int, default=1)
parser.add_argument("--steps_per_epoch", help="number of steps per epoch", type=int, default=100)
parser.add_argument("--tensorboard", help="launch tensorboard process", action="store_true")
args = parser.parse_args()
print("args:", args)
data = sc.textFile(args.data)
data = data.map(lambda l: l.encode("UTF8", "ignore").split('\t'))
labels = data.map(lambda x: x[1])
data = data.map(lambda x: x[19:28066])
header = data.first()
data = data.filter(lambda line: line != header)
label_header = labels.first()
labels = labels.filter(lambda line: line != label_header)
# convert values to float
convertToFloat = lambda data: [float(str(x)) for x in data]
dataset = data.map(convertToFloat)
labels = labels.map(lambda x: float(x))
labels = labels.map(lambda x: keras.utils.to_categorical(x, num_classes=14))
# Split the data for train and validation
#testRDD, trainRDD = data.randomSplit(weights=[0.001, 0.999], seed=42)
#testlabelRDD, trainlabelRDD = labels.randomSplit(weights=[0.001, 0.999], seed=42)
dataRDD = dataset.zip(labels)
#dataRDD = dataRDD.sample(False, 0.01, 42)
#trainRDD = trainRDD.zip(trainlabelRDD)
cluster = TFCluster.run(sc, main_fun, args, args.cluster_size, args.num_ps, args.tensorboard,
TFCluster.InputMode.SPARK, log_dir=args.model_dir)
cluster.train(dataRDD, args.epochs)
cluster.shutdown()
And the log as follows:
Using TensorFlow backend.
19/03/03 22:48:13 INFO SparkContext: Running Spark version 2.3.0.2.6.5.0-292
19/03/03 22:48:13 INFO SparkContext: Submitted application: PhaseOneModelling
19/03/03 22:48:13 INFO SecurityManager: Changing view acls to: Surya@..com
19/03/03 22:48:13 INFO SecurityManager: Changing modify acls to: Surya@..com
19/03/03 22:48:13 INFO SecurityManager: Changing view acls groups to:
19/03/03 22:48:13 INFO SecurityManager: Changing modify acls groups to:
19/03/03 22:48:13 INFO SecurityManager: SecurityManager: authentication disabled; ui acls disabled; users with view permissions: Set(Surya@..com); groups with view permissions: Se t(); users with modify permissions: Set(Surya@..com); groups with modify permissions: Set()
19/03/03 22:48:13 INFO Utils: Successfully started service 'sparkDriver' on port 44164.
19/03/03 22:48:13 INFO SparkEnv: Registering MapOutputTracker
19/03/03 22:48:13 INFO SparkEnv: Registering BlockManagerMaster
19/03/03 22:48:13 INFO BlockManagerMasterEndpoint: Using org.apache.spark.storage.DefaultTopologyMapper for getting topology information
19/03/03 22:48:13 INFO BlockManagerMasterEndpoint: BlockManagerMasterEndpoint up
19/03/03 22:48:13 INFO DiskBlockManager: Created local directory at /tmp/blockmgr-966d6dac-7f6b-4411-91e2-7b4c07185c8d
19/03/03 22:48:13 INFO MemoryStore: MemoryStore started with capacity 153.4 GB
19/03/03 22:48:13 INFO SparkEnv: Registering OutputCommitCoordinator
19/03/03 22:48:14 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.
19/03/03 22:48:14 INFO Utils: Successfully started service 'SparkUI' on port 4041.
19/03/03 22:48:14 INFO SparkUI: Bound SparkUI to 0.0.0.0, and started at http://instance-2026033.ipa.ba..com:4041
19/03/03 22:48:15 INFO RMProxy: Connecting to ResourceManager at instance-2026030.ipa.ba..com/10.28.26.30:8050
19/03/03 22:48:15 INFO Client: Requesting a new application from cluster with 14 NodeManagers
19/03/03 22:48:15 INFO Client: Verifying our application has not requested more than the maximum memory capability of the cluster (202752 MB per container)
19/03/03 22:48:15 INFO Client: Will allocate AM container, with 896 MB memory including 384 MB overhead
19/03/03 22:48:15 INFO Client: Setting up container launch context for our AM
19/03/03 22:48:15 INFO Client: Setting up the launch environment for our AM container
19/03/03 22:48:15 INFO Client: Preparing resources for our AM container
19/03/03 22:48:15 INFO HadoopFSDelegationTokenProvider: getting token for: DFS[DFSClient[clientName=DFSClient_NONMAPREDUCE_-438946188_31, ugi=Surya@..com (auth:KERBEROS)]]
19/03/03 22:48:16 INFO KerberosName: Non-simple name Surya@..com after auth_to_local rule RULE:[1:$1@$0](.*@..com)/L
19/03/03 22:48:16 INFO DFSClient: Created HDFS_DELEGATION_TOKEN token 6651 for Surya@..com on 10.28.26.29:8020
19/03/03 22:48:19 INFO Client: Use hdfs cache file as spark.yarn.archive for HDP, hdfsCacheFile:hdfs://instance-2026029.ipa.ba..com:8020/hdp/apps/2.6.5.0-292/spark2/spark2-hdp-yarn-archive .tar.gz
19/03/03 22:48:19 INFO Client: Source and destination file systems are the same. Not copying hdfs://instance-2026029.ipa.ba..com:8020/hdp/apps/2.6.5.0-292/spark2/spark2-hdp-yarn-archive.ta r.gz
19/03/03 22:48:19 INFO Client: Uploading resource file:/usr/hdp/2.6.5.0-292/spark2/python/lib/pyspark.zip -> hdfs://instance-2026029.ipa.ba..com:8020/user/Surya@..com/.sparkStagin g/application_1551114784635_0177/pyspark.zip
19/03/03 22:48:19 INFO Client: Uploading resource file:/usr/hdp/2.6.5.0-292/spark2/python/lib/py4j-0.10.6-src.zip -> hdfs://instance-2026029.ipa.ba..com:8020/user/Surya@..com/.spa rkStaging/application_1551114784635_0177/py4j-0.10.6-src.zip
19/03/03 22:48:19 INFO Client: Uploading resource file:/tmp/spark-2c81098d-4f91-4a1d-87ea-71ecf1c72204/__spark_conf__626918427791303293.zip -> hdfs://instance-2026029.ipa.ba..com:8020/user /Surya@..com/.sparkStaging/application_1551114784635_0177/__spark_conf__.zip
19/03/03 22:48:19 INFO SecurityManager: Changing view acls to: Surya@..com
19/03/03 22:48:19 INFO SecurityManager: Changing modify acls to: Surya@..com
19/03/03 22:48:19 INFO SecurityManager: Changing view acls groups to:
19/03/03 22:48:19 INFO SecurityManager: Changing modify acls groups to:
19/03/03 22:48:19 INFO SecurityManager: SecurityManager: authentication disabled; ui acls disabled; users with view permissions: Set(Surya@..com); groups with view permissions: Se t(); users with modify permissions: Set(Surya@..com); groups with modify permissions: Set()
19/03/03 22:48:19 INFO Client: Submitting application application_1551114784635_0177 to ResourceManager
19/03/03 22:48:22 INFO YarnClientImpl: Submitted application application_1551114784635_0177
19/03/03 22:48:22 INFO SchedulerExtensionServices: Starting Yarn extension services with app application_1551114784635_0177 and attemptId None
19/03/03 22:48:23 INFO Client: Application report for application_1551114784635_0177 (state: ACCEPTED)
19/03/03 22:48:23 INFO Client:
client token: Token { kind: YARN_CLIENT_TOKEN, service: }
diagnostics: AM container is launched, waiting for AM container to Register with RM
ApplicationMaster host: N/A
ApplicationMaster RPC port: -1
queue: production
start time: 1551671302538
final status: UNDEFINED
tracking URL: https://instance-2026030.ipa.ba..com:8090/proxy/application_1551114784635_0177/
user: Surya@..com
19/03/03 22:48:24 INFO Client: Application report for application_1551114784635_0177 (state: ACCEPTED)
19/03/03 22:48:25 INFO Client: Application report for application_1551114784635_0177 (state: ACCEPTED)
19/03/03 22:48:26 INFO Client: Application report for application_1551114784635_0177 (state: ACCEPTED)
19/03/03 22:48:27 INFO Client: Application report for application_1551114784635_0177 (state: ACCEPTED)
19/03/03 22:48:28 INFO Client: Application report for application_1551114784635_0177 (state: ACCEPTED)
19/03/03 22:48:29 INFO Client: Application report for application_1551114784635_0177 (state: ACCEPTED)
19/03/03 22:48:30 INFO Client: Application report for application_1551114784635_0177 (state: ACCEPTED)
19/03/03 22:48:31 INFO YarnClientSchedulerBackend: Add WebUI Filter. org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter, Map(PROXY_HOSTS -> instance-2026030.ipa.ba..com, PROXY_URI_ BASES -> https://instance-2026030.ipa.ba..com:8090/proxy/application_1551114784635_0177), /proxy/application_1551114784635_0177
19/03/03 22:48:31 INFO JettyUtils: Adding filter: org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter
19/03/03 22:48:31 INFO Client: Application report for application_1551114784635_0177 (state: ACCEPTED)
19/03/03 22:48:31 INFO YarnSchedulerBackend$YarnSchedulerEndpoint: ApplicationMaster registered as NettyRpcEndpointRef(spark-client://YarnAM)
19/03/03 22:48:32 INFO Client: Application report for application_1551114784635_0177 (state: RUNNING)
19/03/03 22:48:32 INFO Client:
client token: Token { kind: YARN_CLIENT_TOKEN, service: }
diagnostics: N/A
ApplicationMaster host: 10.28.26.40
ApplicationMaster RPC port: 0
queue: production
start time: 1551671302538
final status: UNDEFINED
tracking URL: https://instance-2026030.ipa.ba..com:8090/proxy/application_1551114784635_0177/
user: Surya@..com
19/03/03 22:48:32 INFO YarnClientSchedulerBackend: Application application_1551114784635_0177 has started running.
19/03/03 22:48:32 INFO Utils: Successfully started service 'org.apache.spark.network.netty.NettyBlockTransferService' on port 38256.
19/03/03 22:48:32 INFO NettyBlockTransferService: Server created on instance-2026033.ipa.ba..com:38256
19/03/03 22:48:32 INFO BlockManager: Using org.apache.spark.storage.RandomBlockReplicationPolicy for block replication policy
19/03/03 22:48:32 INFO BlockManagerMaster: Registering BlockManager BlockManagerId(driver, instance-2026033.ipa.ba..com, 38256, None)
19/03/03 22:48:32 INFO BlockManagerMasterEndpoint: Registering block manager instance-2026033.ipa.ba..com:38256 with 153.4 GB RAM, BlockManagerId(driver, instance-2026033.ipa.ba..com, 382 56, None)
19/03/03 22:48:32 INFO BlockManagerMaster: Registered BlockManager BlockManagerId(driver, instance-2026033.ipa.ba..com, 38256, None)
19/03/03 22:48:32 INFO BlockManager: Initialized BlockManager: BlockManagerId(driver, instance-2026033.ipa.ba..com, 38256, None)
19/03/03 22:48:33 INFO EventLoggingListener: Logging events to hdfs:/spark2-history/application_1551114784635_0177
19/03/03 22:48:38 INFO YarnSchedulerBackend$YarnDriverEndpoint: Registered executor NettyRpcEndpointRef(spark-client://Executor) (10.28.26.39:36672) with ID 3
19/03/03 22:48:38 INFO BlockManagerMasterEndpoint: Registering block manager instance-2026039.ipa.ba..com:33711 with 153.4 GB RAM, BlockManagerId(3, instance-2026039.ipa.ba..com, 33711, N one)
19/03/03 22:48:39 INFO YarnSchedulerBackend$YarnDriverEndpoint: Registered executor NettyRpcEndpointRef(spark-client://Executor) (10.28.26.38:49710) with ID 2
19/03/03 22:48:39 INFO BlockManagerMasterEndpoint: Registering block manager instance-2026038.ipa.ba..com:45476 with 153.4 GB RAM, BlockManagerId(2, instance-2026038.ipa.ba..com, 45476, N one)
19/03/03 22:48:39 INFO YarnSchedulerBackend$YarnDriverEndpoint: Registered executor NettyRpcEndpointRef(spark-client://Executor) (10.28.26.45:36844) with ID 1
19/03/03 22:48:39 INFO BlockManagerMasterEndpoint: Registering block manager instance-2026045.ipa.ba..com:45485 with 153.4 GB RAM, BlockManagerId(1, instance-2026045.ipa.ba..com, 45485, N one)
19/03/03 22:48:39 INFO YarnSchedulerBackend$YarnDriverEndpoint: Registered executor NettyRpcEndpointRef(spark-client://Executor) (10.28.26.46:58368) with ID 4
19/03/03 22:48:39 INFO YarnSchedulerBackend$YarnDriverEndpoint: Registered executor NettyRpcEndpointRef(spark-client://Executor) (10.28.26.43:54760) with ID 6
19/03/03 22:48:39 INFO BlockManagerMasterEndpoint: Registering block manager instance-2026046.ipa.ba..com:41346 with 153.4 GB RAM, BlockManagerId(4, instance-2026046.ipa.ba..com, 41346, N one)
19/03/03 22:48:40 INFO YarnClientSchedulerBackend: SchedulerBackend is ready for scheduling beginning after reached minRegisteredResourcesRatio: 0.8
19/03/03 22:48:40 INFO BlockManagerMasterEndpoint: Registering block manager instance-2026043.ipa.ba..com:33242 with 153.4 GB RAM, BlockManagerId(6, instance-2026043.ipa.ba..com, 33242, N one)
args: Namespace(cluster_size=6, data='/user/imagen.admins/NormalizedAugustData/Wide/ReducedFeatures/wide_august_tf_idf_normalized_with_col_ReducedFeatures.tsv', epochs=5, export_dir='/tmp/m ss/TensorflowOnSpark/export_dir/', labels=None, model_dir='/tmp/mss/TensorflowOnSpark/model_dir', num_ps=1, steps_per_epoch=2622, tensorboard=False)
19/03/03 22:48:40 INFO MemoryStore: Block broadcast_0 stored as values in memory (estimated size 466.7 KB, free 153.4 GB)
19/03/03 22:48:40 INFO MemoryStore: Block broadcast_0_piece0 stored as bytes in memory (estimated size 34.0 KB, free 153.4 GB)
19/03/03 22:48:40 INFO BlockManagerInfo: Added broadcast_0_piece0 in memory on instance-2026033.ipa.ba..com:38256 (size: 34.0 KB, free: 153.4 GB)
19/03/03 22:48:40 INFO SparkContext: Created broadcast 0 from textFile at NativeMethodAccessorImpl.java:0
19/03/03 22:48:40 INFO KerberosName: Non-simple name Surya@..com after auth_to_local rule RULE:[1:$1@$0](.*@..com)/L
19/03/03 22:48:40 INFO DFSClient: Created HDFS_DELEGATION_TOKEN token 6652 for Surya@..com on 10.28.26.29:8020
19/03/03 22:48:40 INFO TokenCache: Got dt for hdfs://instance-2026029.ipa.ba..com:8020; Kind: HDFS_DELEGATION_TOKEN, Service: 10.28.26.29:8020, Ident: (HDFS_DELEGATION_TOKEN token 6652 fo r Surya@..com)
19/03/03 22:48:40 INFO FileInputFormat: Total input paths to process : 1
19/03/03 22:48:40 INFO SparkContext: Starting job: runJob at PythonRDD.scala:141
19/03/03 22:48:40 INFO DAGScheduler: Got job 0 (runJob at PythonRDD.scala:141) with 1 output partitions
19/03/03 22:48:40 INFO DAGScheduler: Final stage: ResultStage 0 (runJob at PythonRDD.scala:141)
19/03/03 22:48:40 INFO DAGScheduler: Parents of final stage: List()
19/03/03 22:48:40 INFO DAGScheduler: Missing parents: List()
19/03/03 22:48:40 INFO DAGScheduler: Submitting ResultStage 0 (PythonRDD[2] at RDD at PythonRDD.scala:48), which has no missing parents
19/03/03 22:48:40 INFO MemoryStore: Block broadcast_1 stored as values in memory (estimated size 6.4 KB, free 153.4 GB)
19/03/03 22:48:40 INFO MemoryStore: Block broadcast_1_piece0 stored as bytes in memory (estimated size 4.0 KB, free 153.4 GB)
19/03/03 22:48:40 INFO BlockManagerInfo: Added broadcast_1_piece0 in memory on instance-2026033.ipa.ba..com:38256 (size: 4.0 KB, free: 153.4 GB)
19/03/03 22:48:40 INFO SparkContext: Created broadcast 1 from broadcast at DAGScheduler.scala:1039
19/03/03 22:48:40 INFO DAGScheduler: Submitting 1 missing tasks from ResultStage 0 (PythonRDD[2] at RDD at PythonRDD.scala:48) (first 15 tasks are for partitions Vector(0))
19/03/03 22:48:40 INFO YarnScheduler: Adding task set 0.0 with 1 tasks
19/03/03 22:48:40 INFO TaskSetManager: Starting task 0.0 in stage 0.0 (TID 0, instance-2026046.ipa.ba..com, executor 4, partition 0, RACK_LOCAL, 8020 bytes)
19/03/03 22:48:41 INFO YarnSchedulerBackend$YarnDriverEndpoint: Registered executor NettyRpcEndpointRef(spark-client://Executor) (10.28.26.37:34390) with ID 5
19/03/03 22:48:41 INFO BlockManagerMasterEndpoint: Registering block manager instance-2026037.ipa.ba..com:45022 with 153.4 GB RAM, BlockManagerId(5, instance-2026037.ipa.ba..com, 45022, N one)
19/03/03 22:48:41 INFO BlockManagerInfo: Added broadcast_1_piece0 in memory on instance-2026046.ipa.ba..com:41346 (size: 4.0 KB, free: 153.4 GB)
19/03/03 22:48:41 INFO BlockManagerInfo: Added broadcast_0_piece0 in memory on instance-2026046.ipa.ba..com:41346 (size: 34.0 KB, free: 153.4 GB)
19/03/03 22:48:43 INFO TaskSetManager: Finished task 0.0 in stage 0.0 (TID 0) in 2887 ms on instance-2026046.ipa.ba..com (executor 4) (1/1)
19/03/03 22:48:43 INFO YarnScheduler: Removed TaskSet 0.0, whose tasks have all completed, from pool
19/03/03 22:48:43 INFO DAGScheduler: ResultStage 0 (runJob at PythonRDD.scala:141) finished in 3.100 s
19/03/03 22:48:43 INFO DAGScheduler: Job 0 finished: runJob at PythonRDD.scala:141, took 3.163148 s
19/03/03 22:48:43 INFO SparkContext: Starting job: runJob at PythonRDD.scala:141
19/03/03 22:48:43 INFO DAGScheduler: Got job 1 (runJob at PythonRDD.scala:141) with 1 output partitions
19/03/03 22:48:43 INFO DAGScheduler: Final stage: ResultStage 1 (runJob at PythonRDD.scala:141)
19/03/03 22:48:43 INFO DAGScheduler: Parents of final stage: List()
19/03/03 22:48:43 INFO DAGScheduler: Missing parents: List()
19/03/03 22:48:43 INFO DAGScheduler: Submitting ResultStage 1 (PythonRDD[3] at RDD at PythonRDD.scala:48), which has no missing parents
19/03/03 22:48:43 INFO MemoryStore: Block broadcast_2 stored as values in memory (estimated size 6.3 KB, free 153.4 GB)
19/03/03 22:48:43 INFO MemoryStore: Block broadcast_2_piece0 stored as bytes in memory (estimated size 4.0 KB, free 153.4 GB)
19/03/03 22:48:43 INFO BlockManagerInfo: Added broadcast_2_piece0 in memory on instance-2026033.ipa.ba..com:38256 (size: 4.0 KB, free: 153.4 GB)
19/03/03 22:48:43 INFO SparkContext: Created broadcast 2 from broadcast at DAGScheduler.scala:1039
19/03/03 22:48:43 INFO DAGScheduler: Submitting 1 missing tasks from ResultStage 1 (PythonRDD[3] at RDD at PythonRDD.scala:48) (first 15 tasks are for partitions Vector(0))
19/03/03 22:48:43 INFO YarnScheduler: Adding task set 1.0 with 1 tasks
19/03/03 22:48:43 INFO TaskSetManager: Starting task 0.0 in stage 1.0 (TID 1, instance-2026037.ipa.ba..com, executor 5, partition 0, NODE_LOCAL, 8020 bytes)
19/03/03 22:48:44 INFO BlockManagerInfo: Added broadcast_2_piece0 in memory on instance-2026037.ipa.ba..com:45022 (size: 4.0 KB, free: 153.4 GB)
19/03/03 22:48:44 INFO BlockManagerInfo: Added broadcast_0_piece0 in memory on instance-2026037.ipa.ba..com:45022 (size: 34.0 KB, free: 153.4 GB)
19/03/03 22:48:45 INFO TaskSetManager: Finished task 0.0 in stage 1.0 (TID 1) in 2121 ms on instance-2026037.ipa.ba..com (executor 5) (1/1)
19/03/03 22:48:45 INFO YarnScheduler: Removed TaskSet 1.0, whose tasks have all completed, from pool
19/03/03 22:48:45 INFO DAGScheduler: ResultStage 1 (runJob at PythonRDD.scala:141) finished in 2.129 s
19/03/03 22:48:45 INFO DAGScheduler: Job 1 finished: runJob at PythonRDD.scala:141, took 2.132856 s
2019-03-03 22:48:46,218 INFO (MainThread-75262) Reserving TFSparkNodes
2019-03-03 22:48:46,218 INFO (MainThread-75262) cluster_template: {'ps': [0], 'worker': [1, 2, 3, 4, 5]}
2019-03-03 22:48:46,219 INFO (MainThread-75262) listening for reservations at ('10.28.26.33', 42213)
2019-03-03 22:48:46,219 INFO (MainThread-75262) Starting TensorFlow on executors
2019-03-03 22:48:46,224 INFO (MainThread-75262) Waiting for TFSparkNodes to start
2019-03-03 22:48:46,224 INFO (MainThread-75262) waiting for 6 reservations
19/03/03 22:48:46 INFO SparkContext: Starting job: foreachPartition at /usr/lib/python2.7/site-packages/tensorflowonspark/TFCluster.py:301
19/03/03 22:48:46 INFO DAGScheduler: Got job 2 (foreachPartition at /usr/lib/python2.7/site-packages/tensorflowonspark/TFCluster.py:301) with 6 output partitions
19/03/03 22:48:46 INFO DAGScheduler: Final stage: ResultStage 2 (foreachPartition at /usr/lib/python2.7/site-packages/tensorflowonspark/TFCluster.py:301)
19/03/03 22:48:46 INFO DAGScheduler: Parents of final stage: List()
19/03/03 22:48:46 INFO DAGScheduler: Missing parents: List()
19/03/03 22:48:46 INFO DAGScheduler: Submitting ResultStage 2 (PythonRDD[8] at foreachPartition at /usr/lib/python2.7/site-packages/tensorflowonspark/TFCluster.py:301), which has no missing parents
19/03/03 22:48:46 INFO MemoryStore: Block broadcast_3 stored as values in memory (estimated size 17.4 KB, free 153.4 GB)
19/03/03 22:48:46 INFO MemoryStore: Block broadcast_3_piece0 stored as bytes in memory (estimated size 12.5 KB, free 153.4 GB)
19/03/03 22:48:46 INFO BlockManagerInfo: Added broadcast_3_piece0 in memory on instance-2026033.ipa.ba..com:38256 (size: 12.5 KB, free: 153.4 GB)
19/03/03 22:48:46 INFO SparkContext: Created broadcast 3 from broadcast at DAGScheduler.scala:1039
19/03/03 22:48:46 INFO DAGScheduler: Submitting 6 missing tasks from ResultStage 2 (PythonRDD[8] at foreachPartition at /usr/lib/python2.7/site-packages/tensorflowonspark/TFCluster.py:301) (first 15 tasks are for partitions Vector(0, 1, 2, 3, 4, 5))
19/03/03 22:48:46 INFO YarnScheduler: Adding task set 2.0 with 6 tasks
19/03/03 22:48:46 INFO TaskSetManager: Starting task 0.0 in stage 2.0 (TID 2, instance-2026039.ipa.ba..com, executor 3, partition 0, PROCESS_LOCAL, 7869 bytes)
19/03/03 22:48:46 INFO TaskSetManager: Starting task 1.0 in stage 2.0 (TID 3, instance-2026046.ipa.ba..com, executor 4, partition 1, PROCESS_LOCAL, 7869 bytes)
19/03/03 22:48:46 INFO TaskSetManager: Starting task 2.0 in stage 2.0 (TID 4, instance-2026038.ipa.ba..com, executor 2, partition 2, PROCESS_LOCAL, 7869 bytes)
19/03/03 22:48:46 INFO TaskSetManager: Starting task 3.0 in stage 2.0 (TID 5, instance-2026037.ipa.ba..com, executor 5, partition 3, PROCESS_LOCAL, 7869 bytes)
19/03/03 22:48:46 INFO TaskSetManager: Starting task 4.0 in stage 2.0 (TID 6, instance-2026043.ipa.ba..com, executor 6, partition 4, PROCESS_LOCAL, 7869 bytes)
19/03/03 22:48:46 INFO TaskSetManager: Starting task 5.0 in stage 2.0 (TID 7, instance-2026045.ipa.ba..com, executor 1, partition 5, PROCESS_LOCAL, 7869 bytes)
19/03/03 22:48:46 INFO BlockManagerInfo: Added broadcast_3_piece0 in memory on instance-2026037.ipa.ba..com:45022 (size: 12.5 KB, free: 153.4 GB)
19/03/03 22:48:46 INFO BlockManagerInfo: Added broadcast_3_piece0 in memory on instance-2026046.ipa.ba..com:41346 (size: 12.5 KB, free: 153.4 GB)
19/03/03 22:48:46 INFO BlockManagerInfo: Added broadcast_3_piece0 in memory on instance-2026039.ipa.ba..com:33711 (size: 12.5 KB, free: 153.4 GB)
19/03/03 22:48:46 INFO BlockManagerInfo: Added broadcast_3_piece0 in memory on instance-2026038.ipa.ba..com:45476 (size: 12.5 KB, free: 153.4 GB)
19/03/03 22:48:46 INFO BlockManagerInfo: Added broadcast_3_piece0 in memory on instance-2026045.ipa.ba..com:45485 (size: 12.5 KB, free: 153.4 GB)
19/03/03 22:48:46 INFO BlockManagerInfo: Added broadcast_3_piece0 in memory on instance-2026043.ipa.ba..com:33242 (size: 12.5 KB, free: 153.4 GB)
2019-03-03 22:48:47,226 INFO (MainThread-75262) waiting for 6 reservations
2019-03-03 22:48:48,226 INFO (MainThread-75262) waiting for 4 reservations
2019-03-03 22:48:49,228 INFO (MainThread-75262) all reservations completed
2019-03-03 22:48:49,228 INFO (MainThread-75262) All TFSparkNodes started
2019-03-03 22:48:49,228 INFO (MainThread-75262) {'executor_id': 3, 'addr': '/tmp/pymp-Nm3GK_/listener-psN9ka', 'task_index': 2, 'job_name': 'worker', 'authkey': '\x19\x83\xec\x1e\xacNM\x16\ x89\xc0\xa0,\x9b\x87$\xd3', 'host': '10.28.26.37', 'port': 34616, 'tb_pid': 0, 'tb_port': 0}
2019-03-03 22:48:49,228 INFO (MainThread-75262) {'executor_id': 1, 'addr': '/tmp/pymp-anFXbv/listener-gYfcyl', 'task_index': 0, 'job_name': 'worker', 'authkey': '\xca\xfe\xf8+k\xbfC\n\xbfI\ x19\xc7=\xefR\x17', 'host': '10.28.26.46', 'port': 35676, 'tb_pid': 0, 'tb_port': 0}
2019-03-03 22:48:49,228 INFO (MainThread-75262) {'executor_id': 2, 'addr': '/tmp/pymp-Ws00Ww/listener-J06992', 'task_index': 1, 'job_name': 'worker', 'authkey': '?\xc8\xef\xde\x98\xb3EB\x8f O\x80\x89\xeb\xff\x83\x91', 'host': '10.28.26.38', 'port': 34594, 'tb_pid': 0, 'tb_port': 0}
2019-03-03 22:48:49,228 INFO (MainThread-75262) {'executor_id': 0, 'addr': ('10.28.26.39', 45967), 'task_index': 0, 'job_name': 'ps', 'authkey': '\xce\x1b$\xbeg6@T\xb0q\xb8I\x04\xc5\x1a\r' , 'host': '10.28.26.39', 'port': 39537, 'tb_pid': 0, 'tb_port': 0}
2019-03-03 22:48:49,229 INFO (MainThread-75262) {'executor_id': 4, 'addr': '/tmp/pymp-9cxRDb/listener-Vfnobx', 'task_index': 3, 'job_name': 'worker', 'authkey': '\xad\xf6\x10\xb6\x9f\x13O\x c1\xbb\xb0\xd6\x85\xb3e\x16\xee', 'host': '10.28.26.43', 'port': 35727, 'tb_pid': 0, 'tb_port': 0}
2019-03-03 22:48:49,229 INFO (MainThread-75262) {'executor_id': 5, 'addr': '/tmp/pymp-C7cs51/listener-KKXXmn', 'task_index': 4, 'job_name': 'worker', 'authkey': ';\xbe"\xd73\xcdLD\xa559Z\xc b{\x92\x92', 'host': '10.28.26.45', 'port': 46460, 'tb_pid': 0, 'tb_port': 0}
2019-03-03 22:48:49,229 INFO (MainThread-75262) Feeding training data
19/03/03 22:48:49 INFO SparkContext: Starting job: collect at PythonRDD.scala:153
19/03/03 22:48:49 INFO DAGScheduler: Got job 3 (collect at PythonRDD.scala:153) with 600 output partitions
19/03/03 22:48:49 INFO DAGScheduler: Final stage: ResultStage 3 (collect at PythonRDD.scala:153)
19/03/03 22:48:49 INFO DAGScheduler: Parents of final stage: List()
19/03/03 22:48:49 INFO DAGScheduler: Missing parents: List()
19/03/03 22:48:49 INFO DAGScheduler: Submitting ResultStage 3 (PythonRDD[10] at RDD at PythonRDD.scala:48), which has no missing parents
19/03/03 22:48:49 INFO MemoryStore: Block broadcast_4 stored as values in memory (estimated size 389.2 KB, free 153.4 GB)
19/03/03 22:48:49 INFO MemoryStore: Block broadcast_4_piece0 stored as bytes in memory (estimated size 242.0 KB, free 153.4 GB)
19/03/03 22:48:49 INFO BlockManagerInfo: Added broadcast_4_piece0 in memory on instance-2026033.ipa.ba..com:38256 (size: 242.0 KB, free: 153.4 GB)
19/03/03 22:48:49 INFO SparkContext: Created broadcast 4 from broadcast at DAGScheduler.scala:1039
19/03/03 22:48:49 INFO DAGScheduler: Submitting 600 missing tasks from ResultStage 3 (PythonRDD[10] at RDD at PythonRDD.scala:48) (first 15 tasks are for partitions Vector(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14))
19/03/03 22:48:49 INFO YarnScheduler: Adding task set 3.0 with 600 tasks
19/03/03 22:48:49 INFO TaskSetManager: Finished task 5.0 in stage 2.0 (TID 7) in 3490 ms on instance-2026045.ipa.ba..com (executor 1) (1/6)
19/03/03 22:48:49 INFO TaskSetManager: Starting task 0.0 in stage 3.0 (TID 8, instance-2026037.ipa.ba..com, executor 5, partition 0, NODE_LOCAL, 8440 bytes)
19/03/03 22:48:49 INFO TaskSetManager: Finished task 3.0 in stage 2.0 (TID 5) in 3569 ms on instance-2026037.ipa.ba..com (executor 5) (2/6)
19/03/03 22:48:49 INFO BlockManagerInfo: Added broadcast_4_piece0 in memory on instance-2026037.ipa.ba..com:45022 (size: 242.0 KB, free: 153.4 GB)
19/03/03 22:48:50 INFO TaskSetManager: Finished task 1.0 in stage 2.0 (TID 3) in 3867 ms on instance-2026046.ipa.ba..com (executor 4) (3/6)
19/03/03 22:48:50 INFO TaskSetManager: Starting task 3.0 in stage 3.0 (TID 9, instance-2026038.ipa.ba..com, executor 2, partition 3, NODE_LOCAL, 8440 bytes)
19/03/03 22:48:50 INFO TaskSetManager: Finished task 2.0 in stage 2.0 (TID 4) in 4240 ms on instance-2026038.ipa.ba..com (executor 2) (4/6)
19/03/03 22:48:50 INFO BlockManagerInfo: Added broadcast_4_piece0 in memory on instance-2026038.ipa.ba..com:45476 (size: 242.0 KB, free: 153.4 GB)
19/03/03 22:48:50 INFO BlockManagerInfo: Added broadcast_0_piece0 in memory on instance-2026038.ipa.ba..com:45476 (size: 34.0 KB, free: 153.4 GB)
19/03/03 22:48:50 INFO TaskSetManager: Starting task 1.0 in stage 3.0 (TID 10, instance-2026043.ipa.ba..com, executor 6, partition 1, NODE_LOCAL, 8440 bytes)
19/03/03 22:48:50 INFO TaskSetManager: Finished task 4.0 in stage 2.0 (TID 6) in 4423 ms on instance-2026043.ipa.ba..com (executor 6) (5/6)
19/03/03 22:48:50 INFO BlockManagerInfo: Added broadcast_4_piece0 in memory on instance-2026043.ipa.ba..com:33242 (size: 242.0 KB, free: 153.4 GB)
19/03/03 22:48:50 INFO BlockManagerInfo: Added broadcast_0_piece0 in memory on instance-2026043.ipa.ba..com:33242 (size: 34.0 KB, free: 153.4 GB)
19/03/03 22:48:54 INFO TaskSetManager: Starting task 2.0 in stage 3.0 (TID 11, instance-2026046.ipa.ba..com, executor 4, partition 2, RACK_LOCAL, 8440 bytes)
19/03/03 22:48:54 INFO TaskSetManager: Starting task 4.0 in stage 3.0 (TID 12, instance-2026045.ipa.ba..com, executor 1, partition 4, RACK_LOCAL, 8440 bytes)
19/03/03 22:48:54 INFO BlockManagerInfo: Added broadcast_4_piece0 in memory on instance-2026045.ipa.ba..com:45485 (size: 242.0 KB, free: 153.4 GB)
19/03/03 22:48:54 INFO BlockManagerInfo: Added broadcast_4_piece0 in memory on instance-2026046.ipa.ba..com:41346 (size: 242.0 KB, free: 153.4 GB)
19/03/03 22:48:54 INFO BlockManagerInfo: Added broadcast_0_piece0 in memory on instance-2026045.ipa.ba..com:45485 (size: 34.0 KB, free: 153.4 GB)
19/03/03 22:49:35 INFO TaskSetManager: Starting task 5.0 in stage 3.0 (TID 13, instance-2026038.ipa.ba..com, executor 2, partition 5, NODE_LOCAL, 8440 bytes)
19/03/03 22:49:35 INFO TaskSetManager: Finished task 3.0 in stage 3.0 (TID 9) in 45462 ms on instance-2026038.ipa.ba..com (executor 2) (1/600)
19/03/03 22:49:36 INFO TaskSetManager: Starting task 6.0 in stage 3.0 (TID 14, instance-2026037.ipa.ba..com, executor 5, partition 6, NODE_LOCAL, 8440 bytes)
19/03/03 22:49:36 INFO TaskSetManager: Finished task 0.0 in stage 3.0 (TID 8) in 46239 ms on instance-2026037.ipa.ba..com (executor 5) (2/600)
19/03/03 22:49:36 INFO TaskSetManager: Starting task 7.0 in stage 3.0 (TID 15, instance-2026043.ipa.ba..com, executor 6, partition 7, NODE_LOCAL, 8440 bytes)
19/03/03 22:49:36 INFO TaskSetManager: Finished task 1.0 in stage 3.0 (TID 10) in 45790 ms on instance-2026043.ipa.ba..com (executor 6) (3/600)
19/03/03 22:49:39 INFO TaskSetManager: Finished task 2.0 in stage 3.0 (TID 11) in 44513 ms on instance-2026046.ipa.ba..com (executor 4) (4/600)
19/03/03 22:49:39 INFO TaskSetManager: Starting task 8.0 in stage 3.0 (TID 16, instance-2026046.ipa.ba..com, executor 4, partition 8, RACK_LOCAL, 8440 bytes)
19/03/03 22:49:40 INFO TaskSetManager: Starting task 9.0 in stage 3.0 (TID 17, instance-2026045.ipa.ba..com, executor 1, partition 9, RACK_LOCAL, 8440 bytes)
19/03/03 22:49:40 INFO TaskSetManager: Finished task 4.0 in stage 3.0 (TID 12) in 45791 ms on instance-2026045.ipa.ba..com (executor 1) (5/600)
19/03/03 22:50:15 INFO TaskSetManager: Starting task 21.0 in stage 3.0 (TID 18, instance-2026038.ipa.ba..com, executor 2, partition 21, NODE_LOCAL, 8440 bytes)
19/03/03 22:50:15 INFO TaskSetManager: Finished task 5.0 in stage 3.0 (TID 13) in 39597 ms on instance-2026038.ipa.ba..com (executor 2) (6/600)
19/03/03 22:50:16 INFO TaskSetManager: Starting task 11.0 in stage 3.0 (TID 19, instance-2026043.ipa.ba..com, executor 6, partition 11, NODE_LOCAL, 8440 bytes)
19/03/03 22:50:16 INFO TaskSetManager: Finished task 7.0 in stage 3.0 (TID 15) in 39619 ms on instance-2026043.ipa.ba..com (executor 6) (7/600)
19/03/03 22:50:17 INFO TaskSetManager: Starting task 10.0 in stage 3.0 (TID 20, instance-2026037.ipa.ba..com, executor 5, partition 10, NODE_LOCAL, 8440 bytes)
19/03/03 22:50:17 INFO TaskSetManager: Finished task 6.0 in stage 3.0 (TID 14) in 41346 ms on instance-2026037.ipa.ba..com (executor 5) (8/600)
19/03/03 22:50:19 INFO TaskSetManager: Finished task 8.0 in stage 3.0 (TID 16) in 39872 ms on instance-2026046.ipa.ba..com (executor 4) (9/600)
19/03/03 22:50:20 INFO TaskSetManager: Starting task 12.0 in stage 3.0 (TID 21, instance-2026046.ipa.ba..com, executor 4, partition 12, RACK_LOCAL, 8440 bytes)
19/03/03 22:50:24 INFO TaskSetManager: Starting task 13.0 in stage 3.0 (TID 22, instance-2026045.ipa.ba..com, executor 1, partition 13, RACK_LOCAL, 8440 bytes)
19/03/03 22:50:24 INFO TaskSetManager: Finished task 9.0 in stage 3.0 (TID 17) in 43885 ms on instance-2026045.ipa.ba..com (executor 1) (10/600)
19/03/03 22:50:54 INFO TaskSetManager: Starting task 26.0 in stage 3.0 (TID 23, instance-2026038.ipa.ba..com, executor 2, partition 26, NODE_LOCAL, 8440 bytes)
19/03/03 22:50:54 INFO TaskSetManager: Finished task 21.0 in stage 3.0 (TID 18) in 39348 ms on instance-2026038.ipa.ba..com (executor 2) (11/600)
19/03/03 22:50:55 INFO TaskSetManager: Starting task 14.0 in stage 3.0 (TID 24, instance-2026043.ipa.ba..com, executor 6, partition 14, NODE_LOCAL, 8440 bytes)
19/03/03 22:50:55 INFO TaskSetManager: Finished task 11.0 in stage 3.0 (TID 19) in 39112 ms on instance-2026043.ipa.ba..com (executor 6) (12/600)
19/03/03 22:50:58 INFO TaskSetManager: Starting task 24.0 in stage 3.0 (TID 25, instance-2026037.ipa.ba..com, executor 5, partition 24, NODE_LOCAL, 8440 bytes)
19/03/03 22:50:58 INFO TaskSetManager: Finished task 10.0 in stage 3.0 (TID 20) in 41423 ms on instance-2026037.ipa.ba..com (executor 5) (13/600)
19/03/03 22:51:00 INFO TaskSetManager: Finished task 12.0 in stage 3.0 (TID 21) in 39971 ms on instance-2026046.ipa.ba..com (executor 4) (14/600)
19/03/03 22:51:02 INFO TaskSetManager: Starting task 15.0 in stage 3.0 (TID 26, instance-2026046.ipa.ba..com, executor 4, partition 15, RACK_LOCAL, 8440 bytes)
19/03/03 22:51:03 INFO TaskSetManager: Starting task 16.0 in stage 3.0 (TID 27, instance-2026045.ipa.ba..com, executor 1, partition 16, RACK_LOCAL, 8440 bytes)
19/03/03 22:51:03 INFO TaskSetManager: Finished task 13.0 in stage 3.0 (TID 22) in 39738 ms on instance-2026045.ipa.ba..com (executor 1) (15/600)
19/03/03 22:51:34 INFO TaskSetManager: Starting task 32.0 in stage 3.0 (TID 28, instance-2026043.ipa.ba..com, executor 6, partition 32, NODE_LOCAL, 8440 bytes)
19/03/03 22:51:34 INFO TaskSetManager: Finished task 14.0 in stage 3.0 (TID 24) in 39273 ms on instance-2026043.ipa.ba..com (executor 6) (16/600)
19/03/03 22:51:34 INFO TaskSetManager: Starting task 31.0 in stage 3.0 (TID 29, instance-2026038.ipa.ba..com, executor 2, partition 31, NODE_LOCAL, 8440 bytes)
19/03/03 22:51:34 INFO TaskSetManager: Finished task 26.0 in stage 3.0 (TID 23) in 39756 ms on instance-2026038.ipa.ba..com (executor 2) (17/600)
19/03/03 22:51:40 INFO TaskSetManager: Starting task 25.0 in stage 3.0 (TID 30, instance-2026037.ipa.ba..com, executor 5, partition 25, NODE_LOCAL, 8440 bytes)
19/03/03 22:51:40 INFO TaskSetManager: Finished task 24.0 in stage 3.0 (TID 25) in 41287 ms on instance-2026037.ipa.ba..com (executor 5) (18/600)
19/03/03 22:51:42 INFO TaskSetManager: Finished task 15.0 in stage 3.0 (TID 26) in 39507 ms on instance-2026046.ipa.ba..com (executor 4) (19/600)
19/03/03 22:51:43 INFO TaskSetManager: Starting task 17.0 in stage 3.0 (TID 31, instance-2026045.ipa.ba..com, executor 1, partition 17, RACK_LOCAL, 8440 bytes)
19/03/03 22:51:43 INFO TaskSetManager: Finished task 16.0 in stage 3.0 (TID 27) in 39474 ms on instance-2026045.ipa.ba..com (executor 1) (20/600)
19/03/04 00:07:54 INFO TaskSetManager: Starting task 541.0 in stage 3.0 (TID 583, instance-2026038.ipa.ba..com, executor 2, partition 541, RACK_LOCAL, 8440 bytes)
19/03/04 00:07:54 INFO TaskSetManager: Finished task 533.0 in stage 3.0 (TID 579) in 40271 ms on instance-2026038.ipa.ba..com (executor 2) (571/600)
19/03/04 00:07:57 INFO TaskSetManager: Starting task 543.0 in stage 3.0 (TID 584, instance-2026037.ipa.ba..com, executor 5, partition 543, RACK_LOCAL, 8440 bytes)
19/03/04 00:07:57 INFO TaskSetManager: Finished task 532.0 in stage 3.0 (TID 578) in 44489 ms on instance-2026037.ipa.ba..com (executor 5) (572/600)
19/03/04 00:08:12 INFO TaskSetManager: Starting task 544.0 in stage 3.0 (TID 585, instance-2026043.ipa.ba..com, executor 6, partition 544, RACK_LOCAL, 8440 bytes)
19/03/04 00:08:12 INFO TaskSetManager: Finished task 534.0 in stage 3.0 (TID 580) in 40645 ms on instance-2026043.ipa.ba..com (executor 6) (573/600)
19/03/04 00:08:17 INFO TaskSetManager: Starting task 545.0 in stage 3.0 (TID 586, instance-2026045.ipa.ba..com, executor 1, partition 545, RACK_LOCAL, 8440 bytes)
19/03/04 00:08:17 INFO TaskSetManager: Finished task 535.0 in stage 3.0 (TID 581) in 40536 ms on instance-2026045.ipa.ba..com (executor 1) (574/600)
19/03/04 00:08:31 INFO TaskSetManager: Starting task 548.0 in stage 3.0 (TID 587, instance-2026046.ipa.ba..com, executor 4, partition 548, RACK_LOCAL, 8440 bytes)
19/03/04 00:08:31 INFO TaskSetManager: Finished task 540.0 in stage 3.0 (TID 582) in 38886 ms on instance-2026046.ipa.ba..com (executor 4) (575/600)
19/03/04 00:08:36 INFO TaskSetManager: Starting task 551.0 in stage 3.0 (TID 588, instance-2026038.ipa.ba..com, executor 2, partition 551, RACK_LOCAL, 8440 bytes)
19/03/04 00:08:36 INFO TaskSetManager: Finished task 541.0 in stage 3.0 (TID 583) in 41700 ms on instance-2026038.ipa.ba..com (executor 2) (576/600)
19/03/04 00:08:41 INFO TaskSetManager: Starting task 554.0 in stage 3.0 (TID 589, instance-2026037.ipa.ba..com, executor 5, partition 554, RACK_LOCAL, 8440 bytes)
19/03/04 00:08:41 INFO TaskSetManager: Finished task 543.0 in stage 3.0 (TID 584) in 44824 ms on instance-2026037.ipa.ba..com (executor 5) (577/600)
19/03/04 00:08:54 INFO TaskSetManager: Starting task 555.0 in stage 3.0 (TID 590, instance-2026043.ipa.ba..com, executor 6, partition 555, RACK_LOCAL, 8440 bytes)
19/03/04 00:08:54 INFO TaskSetManager: Finished task 544.0 in stage 3.0 (TID 585) in 41618 ms on instance-2026043.ipa.ba..com (executor 6) (578/600)
19/03/04 00:08:58 INFO TaskSetManager: Starting task 558.0 in stage 3.0 (TID 591, instance-2026045.ipa.ba..com, executor 1, partition 558, RACK_LOCAL, 8440 bytes)
19/03/04 00:08:58 INFO TaskSetManager: Finished task 545.0 in stage 3.0 (TID 586) in 40567 ms on instance-2026045.ipa.ba..com (executor 1) (579/600)
19/03/04 00:09:10 INFO TaskSetManager: Starting task 559.0 in stage 3.0 (TID 592, instance-2026046.ipa.ba..com, executor 4, partition 559, RACK_LOCAL, 8440 bytes)
19/03/04 00:09:10 INFO TaskSetManager: Finished task 548.0 in stage 3.0 (TID 587) in 39338 ms on instance-2026046.ipa.ba..com (executor 4) (580/600)
19/03/04 00:09:17 INFO TaskSetManager: Starting task 562.0 in stage 3.0 (TID 593, instance-2026038.ipa.ba..com, executor 2, partition 562, RACK_LOCAL, 8440 bytes)
19/03/04 00:09:17 INFO TaskSetManager: Finished task 551.0 in stage 3.0 (TID 588) in 41027 ms on instance-2026038.ipa.ba..com (executor 2) (581/600)
19/03/04 00:09:27 INFO TaskSetManager: Starting task 563.0 in stage 3.0 (TID 594, instance-2026037.ipa.ba..com, executor 5, partition 563, RACK_LOCAL, 8440 bytes)
19/03/04 00:09:27 INFO TaskSetManager: Finished task 554.0 in stage 3.0 (TID 589) in 45716 ms on instance-2026037.ipa.ba..com (executor 5) (582/600)
19/03/04 00:09:35 INFO TaskSetManager: Starting task 565.0 in stage 3.0 (TID 595, instance-2026043.ipa.ba..com, executor 6, partition 565, RACK_LOCAL, 8440 bytes)
19/03/04 00:09:35 INFO TaskSetManager: Finished task 555.0 in stage 3.0 (TID 590) in 41138 ms on instance-2026043.ipa.ba..com (executor 6) (583/600)
19/03/04 00:09:40 INFO TaskSetManager: Starting task 568.0 in stage 3.0 (TID 596, instance-2026045.ipa.ba..com, executor 1, partition 568, RACK_LOCAL, 8440 bytes)
19/03/04 00:09:40 INFO TaskSetManager: Finished task 558.0 in stage 3.0 (TID 591) in 42608 ms on instance-2026045.ipa.ba..com (executor 1) (584/600)
19/03/04 00:09:50 INFO TaskSetManager: Starting task 574.0 in stage 3.0 (TID 597, instance-2026046.ipa.ba..com, executor 4, partition 574, RACK_LOCAL, 8440 bytes)
19/03/04 00:09:50 INFO TaskSetManager: Finished task 559.0 in stage 3.0 (TID 592) in 39112 ms on instance-2026046.ipa.ba..com (executor 4) (585/600)
19/03/04 00:09:58 INFO TaskSetManager: Starting task 578.0 in stage 3.0 (TID 598, instance-2026038.ipa.ba..com, executor 2, partition 578, RACK_LOCAL, 8440 bytes)
19/03/04 00:09:58 INFO TaskSetManager: Finished task 562.0 in stage 3.0 (TID 593) in 41681 ms on instance-2026038.ipa.ba..com (executor 2) (586/600)
19/03/04 00:10:08 INFO TaskSetManager: Starting task 579.0 in stage 3.0 (TID 599, instance-2026037.ipa.ba..com, executor 5, partition 579, RACK_LOCAL, 8440 bytes)
19/03/04 00:10:08 INFO TaskSetManager: Finished task 563.0 in stage 3.0 (TID 594) in 41401 ms on instance-2026037.ipa.ba..com (executor 5) (587/600)
19/03/04 00:10:16 INFO TaskSetManager: Starting task 582.0 in stage 3.0 (TID 600, instance-2026043.ipa.ba..com, executor 6, partition 582, RACK_LOCAL, 8440 bytes)
19/03/04 00:10:16 INFO TaskSetManager: Finished task 565.0 in stage 3.0 (TID 595) in 41009 ms on instance-2026043.ipa.ba..com (executor 6) (588/600)
19/03/04 00:10:22 INFO TaskSetManager: Starting task 583.0 in stage 3.0 (TID 601, instance-2026045.ipa.ba..com, executor 1, partition 583, RACK_LOCAL, 8440 bytes)
19/03/04 00:10:22 INFO TaskSetManager: Finished task 568.0 in stage 3.0 (TID 596) in 41692 ms on instance-2026045.ipa.ba..com (executor 1) (589/600)
19/03/04 00:10:29 INFO TaskSetManager: Starting task 584.0 in stage 3.0 (TID 602, instance-2026046.ipa.ba..com, executor 4, partition 584, RACK_LOCAL, 8440 bytes)
19/03/04 00:10:29 INFO TaskSetManager: Finished task 574.0 in stage 3.0 (TID 597) in 39804 ms on instance-2026046.ipa.ba..com (executor 4) (590/600)
19/03/04 00:10:41 INFO TaskSetManager: Starting task 588.0 in stage 3.0 (TID 603, instance-2026038.ipa.ba..com, executor 2, partition 588, RACK_LOCAL, 8440 bytes)
19/03/04 00:10:41 INFO TaskSetManager: Finished task 578.0 in stage 3.0 (TID 598) in 42417 ms on instance-2026038.ipa.ba..com (executor 2) (591/600)
19/03/04 00:10:50 INFO TaskSetManager: Starting task 590.0 in stage 3.0 (TID 604, instance-2026037.ipa.ba..com, executor 5, partition 590, RACK_LOCAL, 8440 bytes)
19/03/04 00:10:50 INFO TaskSetManager: Finished task 579.0 in stage 3.0 (TID 599) in 41431 ms on instance-2026037.ipa.ba..com (executor 5) (592/600)
19/03/04 00:10:59 INFO TaskSetManager: Starting task 591.0 in stage 3.0 (TID 605, instance-2026043.ipa.ba..com, executor 6, partition 591, RACK_LOCAL, 8440 bytes)
19/03/04 00:10:59 INFO TaskSetManager: Finished task 582.0 in stage 3.0 (TID 600) in 42926 ms on instance-2026043.ipa.ba..com (executor 6) (593/600)
19/03/04 00:11:04 INFO TaskSetManager: Starting task 595.0 in stage 3.0 (TID 606, instance-2026045.ipa.ba..com, executor 1, partition 595, RACK_LOCAL, 8440 bytes)
19/03/04 00:11:04 INFO TaskSetManager: Finished task 583.0 in stage 3.0 (TID 601) in 42264 ms on instance-2026045.ipa.ba..com (executor 1) (594/600)
19/03/04 00:11:10 INFO TaskSetManager: Starting task 597.0 in stage 3.0 (TID 607, instance-2026046.ipa.ba..com, executor 4, partition 597, RACK_LOCAL, 8440 bytes)
19/03/04 00:11:10 INFO TaskSetManager: Finished task 584.0 in stage 3.0 (TID 602) in 40587 ms on instance-2026046.ipa.ba..com (executor 4) (595/600)
19/03/04 00:11:25 INFO TaskSetManager: Finished task 588.0 in stage 3.0 (TID 603) in 44005 ms on instance-2026038.ipa.ba..com (executor 2) (596/600)
19/03/04 00:11:32 INFO TaskSetManager: Finished task 590.0 in stage 3.0 (TID 604) in 42027 ms on instance-2026037.ipa.ba..com (executor 5) (597/600)
19/03/04 00:11:42 INFO TaskSetManager: Finished task 591.0 in stage 3.0 (TID 605) in 43176 ms on instance-2026043.ipa.ba..com (executor 6) (598/600)
19/03/04 00:11:47 INFO TaskSetManager: Finished task 595.0 in stage 3.0 (TID 606) in 43036 ms on instance-2026045.ipa.ba..com (executor 1) (599/600)
19/03/04 00:11:51 INFO TaskSetManager: Finished task 597.0 in stage 3.0 (TID 607) in 41030 ms on instance-2026046.ipa.ba..com (executor 4) (600/600)
19/03/04 00:11:51 INFO YarnScheduler: Removed TaskSet 3.0, whose tasks have all completed, from pool
19/03/04 00:11:51 INFO DAGScheduler: ResultStage 3 (collect at PythonRDD.scala:153) finished in 4982.168 s
19/03/04 00:11:51 INFO DAGScheduler: Job 3 finished: collect at PythonRDD.scala:153, took 4982.215393 s
2019-03-04 00:11:51,504 INFO (MainThread-75262) Stopping TensorFlow nodes
19/03/04 00:11:51 INFO SparkContext: Starting job: collect at PythonRDD.scala:153
19/03/04 00:11:51 INFO DAGScheduler: Got job 4 (collect at PythonRDD.scala:153) with 5 output partitions
19/03/04 00:11:51 INFO DAGScheduler: Final stage: ResultStage 4 (collect at PythonRDD.scala:153)
19/03/04 00:11:51 INFO DAGScheduler: Parents of final stage: List()
19/03/04 00:11:51 INFO DAGScheduler: Missing parents: List()
19/03/04 00:11:51 INFO DAGScheduler: Submitting ResultStage 4 (PythonRDD[12] at RDD at PythonRDD.scala:48), which has no missing parents
19/03/04 00:11:51 INFO MemoryStore: Block broadcast_5 stored as values in memory (estimated size 7.4 KB, free 153.4 GB)
19/03/04 00:11:51 INFO MemoryStore: Block broadcast_5_piece0 stored as bytes in memory (estimated size 4.9 KB, free 153.4 GB)
19/03/04 00:11:51 INFO BlockManagerInfo: Added broadcast_5_piece0 in memory on instance-2026033.ipa.ba..com:38256 (size: 4.9 KB, free: 153.4 GB)
19/03/04 00:11:51 INFO SparkContext: Created broadcast 5 from broadcast at DAGScheduler.scala:1039
19/03/04 00:11:51 INFO DAGScheduler: Submitting 5 missing tasks from ResultStage 4 (PythonRDD[12] at RDD at PythonRDD.scala:48) (first 15 tasks are for partitions Vector(0, 1, 2, 3, 4))
19/03/04 00:11:51 INFO YarnScheduler: Adding task set 4.0 with 5 tasks
19/03/04 00:11:51 INFO TaskSetManager: Starting task 0.0 in stage 4.0 (TID 608, instance-2026045.ipa.ba..com, executor 1, partition 0, PROCESS_LOCAL, 7869 bytes)
19/03/04 00:11:51 INFO TaskSetManager: Starting task 1.0 in stage 4.0 (TID 609, instance-2026037.ipa.ba..com, executor 5, partition 1, PROCESS_LOCAL, 7869 bytes)
19/03/04 00:11:51 INFO TaskSetManager: Starting task 2.0 in stage 4.0 (TID 610, instance-2026043.ipa.ba..com, executor 6, partition 2, PROCESS_LOCAL, 7869 bytes)
19/03/04 00:11:51 INFO TaskSetManager: Starting task 3.0 in stage 4.0 (TID 611, instance-2026046.ipa.ba..com, executor 4, partition 3, PROCESS_LOCAL, 7869 bytes)
19/03/04 00:11:51 INFO TaskSetManager: Starting task 4.0 in stage 4.0 (TID 612, instance-2026038.ipa.ba..com, executor 2, partition 4, PROCESS_LOCAL, 7869 bytes)
19/03/04 00:11:51 INFO BlockManagerInfo: Added broadcast_5_piece0 in memory on instance-2026038.ipa.ba..com:45476 (size: 4.9 KB, free: 153.4 GB)
19/03/04 00:11:51 INFO BlockManagerInfo: Added broadcast_5_piece0 in memory on instance-2026046.ipa.ba..com:41346 (size: 4.9 KB, free: 153.4 GB)
19/03/04 00:11:51 INFO BlockManagerInfo: Added broadcast_5_piece0 in memory on instance-2026043.ipa.ba..com:33242 (size: 4.9 KB, free: 153.4 GB)
19/03/04 00:11:51 INFO BlockManagerInfo: Added broadcast_5_piece0 in memory on instance-2026037.ipa.ba..com:45022 (size: 4.9 KB, free: 153.4 GB)
19/03/04 00:11:51 INFO BlockManagerInfo: Added broadcast_5_piece0 in memory on instance-2026045.ipa.ba..com:45485 (size: 4.9 KB, free: 153.4 GB)
19/03/04 00:11:51 INFO TaskSetManager: Finished task 4.0 in stage 4.0 (TID 612) in 37 ms on instance-2026038.ipa.ba..com (executor 2) (1/5)
19/03/04 00:11:51 INFO TaskSetManager: Finished task 3.0 in stage 4.0 (TID 611) in 40 ms on instance-2026046.ipa.ba..com (executor 4) (2/5)
19/03/04 00:11:51 INFO TaskSetManager: Finished task 2.0 in stage 4.0 (TID 610) in 41 ms on instance-2026043.ipa.ba..com (executor 6) (3/5)
19/03/04 00:11:51 INFO TaskSetManager: Finished task 1.0 in stage 4.0 (TID 609) in 42 ms on instance-2026037.ipa.ba..com (executor 5) (4/5)
19/03/04 00:11:51 INFO TaskSetManager: Finished task 0.0 in stage 4.0 (TID 608) in 44 ms on instance-2026045.ipa.ba..com (executor 1) (5/5)
19/03/04 00:11:51 INFO YarnScheduler: Removed TaskSet 4.0, whose tasks have all completed, from pool
19/03/04 00:11:51 INFO DAGScheduler: ResultStage 4 (collect at PythonRDD.scala:153) finished in 0.052 s
19/03/04 00:11:51 INFO DAGScheduler: Job 4 finished: collect at PythonRDD.scala:153, took 0.054342 s
2019-03-04 00:11:51,584 INFO (MainThread-75262) Shutting down cluster
19/03/04 00:11:54 INFO TaskSetManager: Finished task 0.0 in stage 2.0 (TID 2) in 4988571 ms on instance-2026039.ipa.ba..com (executor 3) (6/6)
19/03/04 00:11:54 INFO YarnScheduler: Removed TaskSet 2.0, whose tasks have all completed, from pool
19/03/04 00:11:54 INFO DAGScheduler: ResultStage 2 (foreachPartition at /usr/lib/python2.7/site-packages/tensorflowonspark/TFCluster.py:301) finished in 4988.578 s
19/03/04 00:11:54 INFO DAGScheduler: Job 2 finished: foreachPartition at /usr/lib/python2.7/site-packages/tensorflowonspark/TFCluster.py:301, took 4988.581628 s
19/03/04 00:11:57 INFO SparkContext: Invoking stop() from shutdown hook
19/03/04 00:11:57 INFO SparkUI: Stopped Spark web UI at http://instance-2026033.ipa.ba..com:4041
19/03/04 00:11:57 INFO YarnClientSchedulerBackend: Interrupting monitor thread
19/03/04 00:11:58 INFO YarnClientSchedulerBackend: Shutting down all executors
19/03/04 00:11:58 INFO YarnSchedulerBackend$YarnDriverEndpoint: Asking each executor to shut down
19/03/04 00:11:58 INFO SchedulerExtensionServices: Stopping SchedulerExtensionServices
(serviceOption=None,
services=List(),
started=false)
19/03/04 00:11:58 INFO YarnClientSchedulerBackend: Stopped
19/03/04 00:11:58 INFO MapOutputTrackerMasterEndpoint: MapOutputTrackerMasterEndpoint stopped!
19/03/04 00:11:58 INFO MemoryStore: MemoryStore cleared
19/03/04 00:11:58 INFO BlockManager: BlockManager stopped
19/03/04 00:11:58 INFO BlockManagerMaster: BlockManagerMaster stopped
19/03/04 00:11:58 INFO OutputCommitCoordinator$OutputCommitCoordinatorEndpoint: OutputCommitCoordinator stopped!
19/03/04 00:11:58 INFO SparkContext: Successfully stopped SparkContext
19/03/04 00:11:58 INFO ShutdownHookManager: Shutdown hook called
19/03/04 00:11:58 INFO ShutdownHookManager: Deleting directory /tmp/spark-2c81098d-4f91-4a1d-87ea-71ecf1c72204
19/03/04 00:11:58 INFO ShutdownHookManager: Deleting directory /tmp/spark-2c81098d-4f91-4a1d-87ea-71ecf1c72204/pyspark-6a95ff4e-93cf-4c31-ac1a-002432e73cd1
19/03/04 00:11:58 INFO ShutdownHookManager: Deleting directory /tmp/spark-3d2e5cc9-f5d1-45a4-a387-50197a773614
[Surya@..com@instance-2026033 ~]$
Metadata
Metadata
Assignees
Labels
No labels