Skip to content

Commit

Permalink
fix tensorflow test hanging issue (#4997)
Browse files Browse the repository at this point in the history
* fix tensorflow test hanging issue

* set smaller timeout for download resource and ignore exception within retry

* take comments

* only override timeout variable for tensorflow tests
  • Loading branch information
frank-dong-ms-zz committed Apr 4, 2020
1 parent 26918a4 commit f94f359
Show file tree
Hide file tree
Showing 3 changed files with 78 additions and 80 deletions.
39 changes: 29 additions & 10 deletions src/Microsoft.ML.Core/Utilities/ResourceManagerUtils.cs
Original file line number Diff line number Diff line change
Expand Up @@ -127,14 +127,22 @@ private async Task<string> DownloadFromUrlWithRetryAsync(IHostEnvironment env, I

for (int i = 0; i < retryTimes; ++i)
{
var thisDownloadResult = await DownloadFromUrlAsync(env, ch, url, fileName, timeout, filePath);
try
{
var thisDownloadResult = await DownloadFromUrlAsync(env, ch, url, fileName, timeout, filePath);

if (string.IsNullOrEmpty(thisDownloadResult))
return thisDownloadResult;
else
downloadResult += thisDownloadResult + @"\n";
if (string.IsNullOrEmpty(thisDownloadResult))
return thisDownloadResult;
else
downloadResult += thisDownloadResult + @"\n";

await Task.Delay(10 * 1000);
await Task.Delay(10 * 1000);
}
catch (Exception ex)
{
// ignore any Exception and retrying download
ch.Warning($"{i+1} - th try: Dowload {fileName} from {url} fail with exception {ex.Message}");
}
}

return downloadResult;
Expand Down Expand Up @@ -257,6 +265,8 @@ private Exception DownloadResource(IHostEnvironment env, IChannel ch, WebClient
string tempPath = Path.GetFullPath(Path.Combine(Path.GetDirectoryName(path), "temp-resource-" + guid.ToString()));
try
{
int blockSize = 4096;

using (var s = webClient.OpenRead(uri))
using (var fh = env.CreateOutputFile(tempPath))
using (var ws = fh.CreateWriteStream())
Expand All @@ -268,15 +278,24 @@ private Exception DownloadResource(IHostEnvironment env, IChannel ch, WebClient
size = 10000000;

long printFreq = (long)(size / 10.0);
var buffer = new byte[4096];
var buffer = new byte[blockSize];
long total = 0;
int count;

// REVIEW: use a progress channel instead.
while ((count = s.Read(buffer, 0, 4096)) > 0)
while (true)
{
var task = s.ReadAsync(buffer, 0, blockSize, ct);
task.Wait();
int count = task.Result;

if(count <= 0)
{
break;
}

ws.Write(buffer, 0, count);
total += count;
if ((total - (total / printFreq) * printFreq) <= 4096)
if ((total - (total / printFreq) * printFreq) <= blockSize)
ch.Info($"{fileName}: Downloaded {total} bytes out of {size}");
if (ct.IsCancellationRequested)
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,6 @@ public TestResourceDownload(ITestOutputHelper helper)
public async Task TestDownloadError()
{
var envVarOld = Environment.GetEnvironmentVariable(ResourceManagerUtils.CustomResourcesUrlEnvVariable);
var timeoutVarOld = Environment.GetEnvironmentVariable(ResourceManagerUtils.TimeoutEnvVariable);
var resourcePathVarOld = Environment.GetEnvironmentVariable(Utils.CustomSearchDirEnvVariable);
Environment.SetEnvironmentVariable(Utils.CustomSearchDirEnvVariable, null);

Expand Down Expand Up @@ -134,7 +133,6 @@ public async Task TestDownloadError()
{
// Set environment variable back to its old value.
Environment.SetEnvironmentVariable(ResourceManagerUtils.CustomResourcesUrlEnvVariable, envVarOld);
Environment.SetEnvironmentVariable(ResourceManagerUtils.TimeoutEnvVariable, timeoutVarOld);
Environment.SetEnvironmentVariable(Utils.CustomSearchDirEnvVariable, resourcePathVarOld);
}
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@
using static Microsoft.ML.DataOperationsCatalog;
using Microsoft.ML.Trainers;
using Microsoft.ML.TestFrameworkCommon.Attributes;
using Microsoft.ML.Internal.Utilities;

namespace Microsoft.ML.Scenarios
{
Expand Down Expand Up @@ -61,8 +62,34 @@ public void Dispose()
[Collection("NoParallelization")]
public sealed class TensorFlowScenariosTests : BaseTestClass, IClassFixture<TensorFlowScenariosTestsFixture>
{
private readonly string _fullImagesetFolderPath;
private readonly string _finalImagesFolderName;
private string _timeOutOldValue;

/// <summary>
/// Creates the test class and resolves the image set folder used by the
/// image classification tests in this class.
/// </summary>
/// <param name="output">Test output helper forwarded to the base test class.</param>
public TensorFlowScenariosTests(ITestOutputHelper output) : base(output)
{
    // Image assets are placed under "<assetsPath>/inputs/images".
    var downloadRoot = Path.Combine(TensorFlowScenariosTestsFixture.assetsPath, "inputs", "images");

    // Download the image set and unzip it; the helper returns the resulting folder name.
    _finalImagesFolderName = DownloadImageSet(downloadRoot);

    // Full path to the image set, i.e. the download root joined with the folder name.
    _fullImagesetFolderPath = Path.Combine(downloadRoot, _finalImagesFolderName);
}

/// <summary>
/// Remembers the current value of the download-timeout environment variable
/// and then overrides it with a 3-minute timeout.
/// </summary>
protected override void Initialize()
{
    // Downloads occasionally get stuck, so use a smaller timeout (3 minutes)
    // to fail fast and let the download be retried.
    const int timeoutValue = 3 * 60 * 1000;
    string timeoutVariable = ResourceManagerUtils.TimeoutEnvVariable;

    // Capture the previous value first so it can be put back later.
    _timeOutOldValue = Environment.GetEnvironmentVariable(timeoutVariable);
    Environment.SetEnvironmentVariable(timeoutVariable, timeoutValue.ToString());
}

/// <summary>
/// Writes the stored <c>_timeOutOldValue</c> back to the download-timeout
/// environment variable.
/// </summary>
protected override void Cleanup()
    => Environment.SetEnvironmentVariable(ResourceManagerUtils.TimeoutEnvVariable, _timeOutOldValue);

private class TestData
Expand Down Expand Up @@ -1250,25 +1277,13 @@ public void TensorFlowStringTest()
}

[TensorFlowFact]
// This test hangs occasionally
[Trait("Category", "SkipInCI")]
public void TensorFlowImageClassificationDefault()
{
string imagesDownloadFolderPath = Path.Combine(TensorFlowScenariosTestsFixture.assetsPath, "inputs",
"images");

//Download the image set and unzip
string finalImagesFolderName = DownloadImageSet(
imagesDownloadFolderPath);

string fullImagesetFolderPath = Path.Combine(
imagesDownloadFolderPath, finalImagesFolderName);

MLContext mlContext = new MLContext(seed: 1);

//Load all the original images info
IEnumerable<ImageData> images = LoadImagesFromDirectory(
folder: fullImagesetFolderPath, useFolderNameAsLabel: true);
folder: _fullImagesetFolderPath, useFolderNameAsLabel: true);

IDataView shuffledFullImagesDataset = mlContext.Data.ShuffleRows(
mlContext.Data.LoadFromEnumerable(images), seed: 1);
Expand All @@ -1285,7 +1300,7 @@ public void TensorFlowImageClassificationDefault()
IDataView trainDataset = trainTestData.TrainSet;
IDataView testDataset = trainTestData.TestSet;

var pipeline = mlContext.Transforms.LoadRawImageBytes("Image", fullImagesetFolderPath, "ImagePath")
var pipeline = mlContext.Transforms.LoadRawImageBytes("Image", _fullImagesetFolderPath, "ImagePath")
.Append(mlContext.MulticlassClassification.Trainers.ImageClassification("Label", "Image")
.Append(mlContext.Transforms.Conversion.MapKeyToValue(outputColumnName: "PredictedLabel", inputColumnName: "PredictedLabel"))); ;

Expand Down Expand Up @@ -1338,25 +1353,13 @@ internal bool ShouldReuse(string workspacePath, string trainSetBottleneckCachedV
[InlineData(ImageClassificationTrainer.Architecture.MobilenetV2)]
[InlineData(ImageClassificationTrainer.Architecture.ResnetV250)]
[InlineData(ImageClassificationTrainer.Architecture.InceptionV3)]
//Skipping test temporarily. This test will be re-enabled once the cause of failures has been determined
[Trait("Category", "SkipInCI")]
public void TensorFlowImageClassification(ImageClassificationTrainer.Architecture arch)
{
string imagesDownloadFolderPath = Path.Combine(TensorFlowScenariosTestsFixture.assetsPath, "inputs",
"images");

//Download the image set and unzip
string finalImagesFolderName = DownloadImageSet(
imagesDownloadFolderPath);

string fullImagesetFolderPath = Path.Combine(
imagesDownloadFolderPath, finalImagesFolderName);

MLContext mlContext = new MLContext(seed: 1);

//Load all the original images info
IEnumerable<ImageData> images = LoadImagesFromDirectory(
folder: fullImagesetFolderPath, useFolderNameAsLabel: true);
folder: _fullImagesetFolderPath, useFolderNameAsLabel: true);

IDataView shuffledFullImagesDataset = mlContext.Data.ShuffleRows(
mlContext.Data.LoadFromEnumerable(images), seed: 1);
Expand All @@ -1372,13 +1375,13 @@ public void TensorFlowImageClassification(ImageClassificationTrainer.Architectur

IDataView trainDataset = trainTestData.TrainSet;
IDataView testDataset = trainTestData.TestSet;
var validationSet = mlContext.Transforms.LoadRawImageBytes("Image", fullImagesetFolderPath, "ImagePath")
var validationSet = mlContext.Transforms.LoadRawImageBytes("Image", _fullImagesetFolderPath, "ImagePath")
.Fit(testDataset)
.Transform(testDataset);

// Check if the bottleneck cached values already exist
var (trainSetBottleneckCachedValuesFileName, validationSetBottleneckCachedValuesFileName,
workspacePath, isReuse) = getInitialParameters(arch, finalImagesFolderName);
workspacePath, isReuse) = getInitialParameters(arch, _finalImagesFolderName);

var options = new ImageClassificationTrainer.Options()
{
Expand All @@ -1401,7 +1404,7 @@ public void TensorFlowImageClassification(ImageClassificationTrainer.Architectur
ValidationSet = validationSet
};

var pipeline = mlContext.Transforms.LoadRawImageBytes("Image", fullImagesetFolderPath, "ImagePath")
var pipeline = mlContext.Transforms.LoadRawImageBytes("Image", _fullImagesetFolderPath, "ImagePath")
.Append(mlContext.MulticlassClassification.Trainers.ImageClassification(options)
.Append(mlContext.Transforms.Conversion.MapKeyToValue(outputColumnName: "PredictedLabel", inputColumnName: "PredictedLabel")));

Expand Down Expand Up @@ -1429,9 +1432,9 @@ public void TensorFlowImageClassification(ImageClassificationTrainer.Architectur
.CreatePredictionEngine<ImageData, ImagePrediction>(loadedModel);

IEnumerable<ImageData> testImages = LoadImagesFromDirectory(
fullImagesetFolderPath, true);
_fullImagesetFolderPath, true);

string[] directories = Directory.GetDirectories(fullImagesetFolderPath);
string[] directories = Directory.GetDirectories(_fullImagesetFolderPath);
string[] labels = new string[directories.Length];
for (int j = 0; j < labels.Length; j++)
{
Expand All @@ -1442,13 +1445,13 @@ public void TensorFlowImageClassification(ImageClassificationTrainer.Architectur
// Test daisy image
ImageData firstImageToPredict = new ImageData
{
ImagePath = Path.Combine(fullImagesetFolderPath, "daisy", "5794835_d15905c7c8_n.jpg")
ImagePath = Path.Combine(_fullImagesetFolderPath, "daisy", "5794835_d15905c7c8_n.jpg")
};

// Test rose image
ImageData secondImageToPredict = new ImageData
{
ImagePath = Path.Combine(fullImagesetFolderPath, "roses", "12240303_80d87f77a3_n.jpg")
ImagePath = Path.Combine(_fullImagesetFolderPath, "roses", "12240303_80d87f77a3_n.jpg")
};

var predictionFirst = predictionEngine.Predict(firstImageToPredict);
Expand Down Expand Up @@ -1486,21 +1489,11 @@ public void TensorFlowImageClassificationWithPolynomialLRScheduling()

internal void TensorFlowImageClassificationWithLRScheduling(LearningRateScheduler learningRateScheduler, int epoch)
{
string imagesDownloadFolderPath = Path.Combine(TensorFlowScenariosTestsFixture.assetsPath, "inputs",
"images");

//Download the image set and unzip
string finalImagesFolderName = DownloadImageSet(
imagesDownloadFolderPath);

string fullImagesetFolderPath = Path.Combine(
imagesDownloadFolderPath, finalImagesFolderName);

MLContext mlContext = new MLContext(seed: 1);

//Load all the original images info
IEnumerable<ImageData> images = LoadImagesFromDirectory(
folder: fullImagesetFolderPath, useFolderNameAsLabel: true);
folder: _fullImagesetFolderPath, useFolderNameAsLabel: true);

IDataView shuffledFullImagesDataset = mlContext.Data.ShuffleRows(
mlContext.Data.LoadFromEnumerable(images), seed: 1);
Expand All @@ -1516,13 +1509,13 @@ internal void TensorFlowImageClassificationWithLRScheduling(LearningRateSchedule

IDataView trainDataset = trainTestData.TrainSet;
IDataView testDataset = trainTestData.TestSet;
var validationSet = mlContext.Transforms.LoadRawImageBytes("Image", fullImagesetFolderPath, "ImagePath")
var validationSet = mlContext.Transforms.LoadRawImageBytes("Image", _fullImagesetFolderPath, "ImagePath")
.Fit(testDataset)
.Transform(testDataset);

// Check if the bottleneck cached values already exist
var (trainSetBottleneckCachedValuesFileName, validationSetBottleneckCachedValuesFileName,
workspacePath, isReuse) = getInitialParameters(ImageClassificationTrainer.Architecture.ResnetV2101, finalImagesFolderName);
workspacePath, isReuse) = getInitialParameters(ImageClassificationTrainer.Architecture.ResnetV2101, _finalImagesFolderName);

var options = new ImageClassificationTrainer.Options()
{
Expand All @@ -1546,7 +1539,7 @@ internal void TensorFlowImageClassificationWithLRScheduling(LearningRateSchedule
LearningRateScheduler = learningRateScheduler
};

var pipeline = mlContext.Transforms.LoadRawImageBytes("Image", fullImagesetFolderPath, "ImagePath")
var pipeline = mlContext.Transforms.LoadRawImageBytes("Image", _fullImagesetFolderPath, "ImagePath")
.Append(mlContext.MulticlassClassification.Trainers.ImageClassification(options))
.Append(mlContext.Transforms.Conversion.MapKeyToValue(
outputColumnName: "PredictedLabel",
Expand Down Expand Up @@ -1575,9 +1568,9 @@ internal void TensorFlowImageClassificationWithLRScheduling(LearningRateSchedule
.CreatePredictionEngine<ImageData, ImagePrediction>(loadedModel);

IEnumerable<ImageData> testImages = LoadImagesFromDirectory(
fullImagesetFolderPath, true);
_fullImagesetFolderPath, true);

string[] directories = Directory.GetDirectories(fullImagesetFolderPath);
string[] directories = Directory.GetDirectories(_fullImagesetFolderPath);
string[] labels = new string[directories.Length];
for (int j = 0; j < labels.Length; j++)
{
Expand All @@ -1588,13 +1581,13 @@ internal void TensorFlowImageClassificationWithLRScheduling(LearningRateSchedule
// Test daisy image
ImageData firstImageToPredict = new ImageData
{
ImagePath = Path.Combine(fullImagesetFolderPath, "daisy", "5794835_d15905c7c8_n.jpg")
ImagePath = Path.Combine(_fullImagesetFolderPath, "daisy", "5794835_d15905c7c8_n.jpg")
};

// Test rose image
ImageData secondImageToPredict = new ImageData
{
ImagePath = Path.Combine(fullImagesetFolderPath, "roses", "12240303_80d87f77a3_n.jpg")
ImagePath = Path.Combine(_fullImagesetFolderPath, "roses", "12240303_80d87f77a3_n.jpg")
};

var predictionFirst = predictionEngine.Predict(firstImageToPredict);
Expand Down Expand Up @@ -1624,25 +1617,13 @@ internal void TensorFlowImageClassificationWithLRScheduling(LearningRateSchedule
[TensorFlowTheory]
[InlineData(ImageClassificationTrainer.EarlyStoppingMetric.Accuracy)]
[InlineData(ImageClassificationTrainer.EarlyStoppingMetric.Loss)]
// This test hangs ocassionally
[Trait("Category", "SkipInCI")]
public void TensorFlowImageClassificationEarlyStopping(ImageClassificationTrainer.EarlyStoppingMetric earlyStoppingMetric)
{
string imagesDownloadFolderPath = Path.Combine(TensorFlowScenariosTestsFixture.assetsPath, "inputs",
"images");

//Download the image set and unzip
string finalImagesFolderName = DownloadImageSet(
imagesDownloadFolderPath);

string fullImagesetFolderPath = Path.Combine(
imagesDownloadFolderPath, finalImagesFolderName);

MLContext mlContext = new MLContext(seed: 1);

//Load all the original images info
IEnumerable<ImageData> images = LoadImagesFromDirectory(
folder: fullImagesetFolderPath, useFolderNameAsLabel: true);
folder: _fullImagesetFolderPath, useFolderNameAsLabel: true);

IDataView shuffledFullImagesDataset = mlContext.Data.ShuffleRows(
mlContext.Data.LoadFromEnumerable(images), seed: 1);
Expand All @@ -1660,13 +1641,13 @@ public void TensorFlowImageClassificationEarlyStopping(ImageClassificationTraine
IDataView testDataset = trainTestData.TestSet;

int lastEpoch = 0;
var validationSet = mlContext.Transforms.LoadRawImageBytes("Image", fullImagesetFolderPath, "ImagePath")
var validationSet = mlContext.Transforms.LoadRawImageBytes("Image", _fullImagesetFolderPath, "ImagePath")
.Fit(testDataset)
.Transform(testDataset);

// Check if the bottleneck cached values already exist
var (trainSetBottleneckCachedValuesFileName, validationSetBottleneckCachedValuesFileName,
workspacePath, isReuse) = getInitialParameters(ImageClassificationTrainer.Architecture.ResnetV2101, finalImagesFolderName);
workspacePath, isReuse) = getInitialParameters(ImageClassificationTrainer.Architecture.ResnetV2101, _finalImagesFolderName);



Expand All @@ -1692,7 +1673,7 @@ public void TensorFlowImageClassificationEarlyStopping(ImageClassificationTraine
ValidationSet = validationSet
};

var pipeline = mlContext.Transforms.LoadRawImageBytes("Image", fullImagesetFolderPath, "ImagePath")
var pipeline = mlContext.Transforms.LoadRawImageBytes("Image", _fullImagesetFolderPath, "ImagePath")
.Append(mlContext.MulticlassClassification.Trainers.ImageClassification(options));

using var trainedModel = pipeline.Fit(trainDataset);
Expand Down

0 comments on commit f94f359

Please sign in to comment.