Skip to content

Commit

Permalink
Remove lazy parameters for GetRowCount (#1621)
Browse files Browse the repository at this point in the history
* Remove lazy parameters for GetRowCount

* Address comments
  • Loading branch information
wschin authored Nov 15, 2018
1 parent 057c4b9 commit 8a45f37
Show file tree
Hide file tree
Showing 37 changed files with 69 additions and 76 deletions.
8 changes: 4 additions & 4 deletions src/Microsoft.ML.Api/DataViewConstructionUtils.cs
Original file line number Diff line number Diff line change
Expand Up @@ -397,7 +397,7 @@ protected DataViewBase(IHostEnvironment env, string name, InternalSchemaDefiniti
}
}

public abstract long? GetRowCount(bool lazy = true);
public abstract long? GetRowCount();

public abstract IRowCursor GetRowCursor(Func<int, bool> predicate, IRandom rand = null);

Expand Down Expand Up @@ -555,7 +555,7 @@ public override bool CanShuffle
get { return true; }
}

public override long? GetRowCount(bool lazy = true)
public override long? GetRowCount()
{
return _data.Count;
}
Expand Down Expand Up @@ -654,7 +654,7 @@ public override bool CanShuffle
get { return false; }
}

public override long? GetRowCount(bool lazy = true)
public override long? GetRowCount()
{
return (_data as ICollection<TRow>)?.Count;
}
Expand Down Expand Up @@ -735,7 +735,7 @@ public override bool CanShuffle
get { return false; }
}

public override long? GetRowCount(bool lazy = true)
public override long? GetRowCount()
{
return null;
}
Expand Down
2 changes: 1 addition & 1 deletion src/Microsoft.ML.Api/StatefulFilterTransform.cs
Original file line number Diff line number Diff line change
Expand Up @@ -99,7 +99,7 @@ private StatefulFilterTransform(IHostEnvironment env, StatefulFilterTransform<TS

public Schema Schema => _bindings.Schema;

public long? GetRowCount(bool lazy = true)
public long? GetRowCount()
{
// REVIEW: currently stateful map is implemented via filter, and this is sub-optimal.
return null;
Expand Down
16 changes: 7 additions & 9 deletions src/Microsoft.ML.Core/Data/IDataView.cs
Original file line number Diff line number Diff line change
Expand Up @@ -82,17 +82,15 @@ public interface IDataView : ISchematized
bool CanShuffle { get; }

/// <summary>
/// Returns the number of rows if known. Null means unknown. If lazy is true, then
/// this is permitted to return null when it might return a non-null value on a subsequent
/// call. This indicates, that the transform does not YET know the number of rows, but
/// may in the future. If lazy is false, then this is permitted to do some work (no more
/// that it would normally do for cursoring) to determine the number of rows.
/// Returns the number of rows if known. Returning null means that the row count is currently unknown,
/// although a subsequent call might return a non-null value. This indicates that the transform does
/// not YET know the number of rows, but may in the future. Implementations should compute the result
/// in O(1) time.
///
/// Most components will return the same answer whether lazy is true or false. Some, like
/// a cache, might return null until the cache is fully populated (when lazy is true). When
/// lazy is false, such a cache would block until the cache was populated.
/// Most implementations will return the same answer every time. Some, like a cache, might
/// return null until the cache is fully populated.
/// </summary>
long? GetRowCount(bool lazy = true);
long? GetRowCount();

/// <summary>
/// Get a row cursor. The active column indices are those for which needCol(col) returns true.
Expand Down
2 changes: 1 addition & 1 deletion src/Microsoft.ML.Data/Data/DataViewUtils.cs
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ public static string[] GetTempColumnNames(this ISchema schema, int n, string tag
/// </summary>
public static long ComputeRowCount(IDataView view)
{
long? countNullable = view.GetRowCount(lazy: false);
long? countNullable = view.GetRowCount();
if (countNullable != null)
return countNullable.Value;
long count = 0;
Expand Down
2 changes: 1 addition & 1 deletion src/Microsoft.ML.Data/Data/RowCursorUtils.cs
Original file line number Diff line number Diff line change
Expand Up @@ -541,7 +541,7 @@ public IRowCursor[] GetRowCursorSet(out IRowCursorConsolidator consolidator, Fun
return new IRowCursor[] { GetRowCursor(needCol, rand) };
}

public long? GetRowCount(bool lazy = true)
public long? GetRowCount()
{
return 1;
}
Expand Down
2 changes: 1 addition & 1 deletion src/Microsoft.ML.Data/DataLoadSave/Binary/BinaryLoader.cs
Original file line number Diff line number Diff line change
Expand Up @@ -761,7 +761,7 @@ public void GetMetadata<TValue>(string kind, int col, ref TValue value)

private long RowCount { get { return _header.RowCount; } }

public long? GetRowCount(bool lazy = true) { return RowCount; }
public long? GetRowCount() { return RowCount; }

public bool CanShuffle { get { return true; } }

Expand Down
4 changes: 2 additions & 2 deletions src/Microsoft.ML.Data/DataLoadSave/CompositeDataLoader.cs
Original file line number Diff line number Diff line change
Expand Up @@ -557,9 +557,9 @@ private static string GenerateTag(int index)
return string.Format("xf{0:00}", index);
}

public long? GetRowCount(bool lazy = true)
public long? GetRowCount()
{
return View.GetRowCount(lazy);
return View.GetRowCount();
}

public bool CanShuffle => View.CanShuffle;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -287,7 +287,7 @@ public void Save(ModelSaveContext ctx)

public Schema Schema { get; }

public long? GetRowCount(bool lazy = true)
public long? GetRowCount()
{
return null;
}
Expand Down
2 changes: 1 addition & 1 deletion src/Microsoft.ML.Data/DataLoadSave/Text/TextLoader.cs
Original file line number Diff line number Diff line change
Expand Up @@ -1352,7 +1352,7 @@ public BoundLoader(TextLoader reader, IMultiStreamSource files)
_files = files;
}

public long? GetRowCount(bool lazy = true)
public long? GetRowCount()
{
// We don't know how many rows there are.
// REVIEW: Should we try to support RowCount?
Expand Down
2 changes: 1 addition & 1 deletion src/Microsoft.ML.Data/DataLoadSave/Text/TextSaver.cs
Original file line number Diff line number Diff line change
Expand Up @@ -420,7 +420,7 @@ private void WriteDataCore(IChannel ch, TextWriter writer, IDataView data,
if (_outputSchema)
WriteSchemaAsComment(writer, header);

double rowCount = data.GetRowCount(true) ?? double.NaN;
double rowCount = data.GetRowCount() ?? double.NaN;
using (var pch = !_silent ? _host.StartProgressChannel("TextSaver: saving data") : null)
{
long stateCount = 0;
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -662,7 +662,7 @@ public VectorType GetSlotType(int col)
}
}

public long? GetRowCount(bool lazy = true)
public long? GetRowCount()
{
return _header.RowCount;
}
Expand Down
6 changes: 3 additions & 3 deletions src/Microsoft.ML.Data/DataView/AppendRowsDataView.cs
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ private AppendRowsDataView(IHostEnvironment env, Schema schema, IDataView[] sour
_counts = null;
break;
}
long? count = dv.GetRowCount(true);
long? count = dv.GetRowCount();
if (count == null || count < 0 || count > int.MaxValue)
{
_canShuffle = false;
Expand Down Expand Up @@ -127,12 +127,12 @@ private void CheckSchemaConsistency()
}
}

public long? GetRowCount(bool lazy = true)
public long? GetRowCount()
{
long sum = 0;
foreach (var source in _sources)
{
var cur = source.GetRowCount(lazy);
var cur = source.GetRowCount();
if (cur == null)
return null;
_host.Check(cur.Value >= 0, "One of the sources returned a negative row count");
Expand Down
2 changes: 1 addition & 1 deletion src/Microsoft.ML.Data/DataView/ArrayDataViewBuilder.cs
Original file line number Diff line number Diff line change
Expand Up @@ -197,7 +197,7 @@ private sealed class DataView : IDataView

public Schema Schema { get { return _schema; } }

public long? GetRowCount(bool lazy = true) { return _rowCount; }
public long? GetRowCount() { return _rowCount; }

public bool CanShuffle { get { return true; } }

Expand Down
17 changes: 6 additions & 11 deletions src/Microsoft.ML.Data/DataView/CacheDataView.cs
Original file line number Diff line number Diff line change
Expand Up @@ -193,18 +193,13 @@ public int MapInputToCacheColumnIndex(int inputIndex)

public Schema Schema => _subsetInput.Schema;

public long? GetRowCount(bool lazy = true)
/// <summary>
/// Return the number of rows if available.
/// </summary>
public long? GetRowCount()
{
if (_rowCount < 0)
{
if (lazy)
return null;
if (_cacheDefaultWaiter == null)
KickoffFiller(new int[0]);
_host.Assert(_cacheDefaultWaiter != null);
_cacheDefaultWaiter.Wait(long.MaxValue);
_host.Assert(_rowCount >= 0);
}
return null;
return _rowCount;
}

Expand Down Expand Up @@ -317,7 +312,7 @@ public IRowSeeker GetSeeker(Func<int, bool> predicate)
_host.CheckValue(predicate, nameof(predicate));
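// The seeker needs to know the row count when it validates the row index to move to.
// NOTE(review): this call was originally meant to force a wait so that _rowCount would have a valid value,
// but the parameterless GetRowCount() returns null while _rowCount < 0 instead of blocking.
// Confirm that the cache is populated before this point, otherwise the assert below can fail.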
GetRowCount(false);
GetRowCount();
_host.Assert(_rowCount >= 0);
var waiter = WaiterWaiter.Create(this, predicate);
if (waiter.IsTrivial)
Expand Down
2 changes: 1 addition & 1 deletion src/Microsoft.ML.Data/DataView/EmptyDataView.cs
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ public EmptyDataView(IHostEnvironment env, Schema schema)
Schema = schema;
}

public long? GetRowCount(bool lazy = true) => 0;
public long? GetRowCount() => 0;

public IRowCursor GetRowCursor(Func<int, bool> needCol, IRandom rand = null)
{
Expand Down
4 changes: 2 additions & 2 deletions src/Microsoft.ML.Data/DataView/OpaqueDataView.cs
Original file line number Diff line number Diff line change
Expand Up @@ -21,9 +21,9 @@ public OpaqueDataView(IDataView source)
_source = source;
}

public long? GetRowCount(bool lazy = true)
public long? GetRowCount()
{
return _source.GetRowCount(lazy);
return _source.GetRowCount();
}

public IRowCursor GetRowCursor(Func<int, bool> predicate, IRandom rand = null)
Expand Down
8 changes: 4 additions & 4 deletions src/Microsoft.ML.Data/DataView/Transposer.cs
Original file line number Diff line number Diff line change
Expand Up @@ -274,7 +274,7 @@ public IRowCursor[] GetRowCursorSet(out IRowCursorConsolidator consolidator, Fun
return _view.GetRowCursorSet(out consolidator, predicate, n, rand);
}

public long? GetRowCount(bool lazy = true)
public long? GetRowCount()
{
// Not a passthrough.
return RowCount;
Expand Down Expand Up @@ -818,9 +818,9 @@ public DataViewSlicer(IHost host, IDataView input, int[] toSlice)
_schema = new SchemaImpl(this, nameToCol);
}

public long? GetRowCount(bool lazy = true)
public long? GetRowCount()
{
return _input.GetRowCount(lazy);
return _input.GetRowCount();
}

/// <summary>
Expand Down Expand Up @@ -1503,7 +1503,7 @@ public SlotDataView(IHostEnvironment env, ITransposeDataView data, int col)
_schemaImpl = new SchemaImpl(this);
}

public long? GetRowCount(bool lazy = true)
public long? GetRowCount()
{
var type = _data.Schema.GetColumnType(_col);
int valueCount = type.ValueCount;
Expand Down
4 changes: 2 additions & 2 deletions src/Microsoft.ML.Data/DataView/ZipDataView.cs
Original file line number Diff line number Diff line change
Expand Up @@ -54,12 +54,12 @@ private ZipDataView(IHost host, IDataView[] sources)

public Schema Schema => _compositeSchema.AsSchema;

public long? GetRowCount(bool lazy = true)
public long? GetRowCount()
{
long min = -1;
foreach (var source in _sources)
{
var cur = source.GetRowCount(lazy);
var cur = source.GetRowCount();
if (cur == null)
return null;
_host.Check(cur.Value >= 0, "One of the sources returned a negative row count");
Expand Down
4 changes: 2 additions & 2 deletions src/Microsoft.ML.Data/Evaluators/RankerEvaluator.cs
Original file line number Diff line number Diff line change
Expand Up @@ -635,9 +635,9 @@ public void Save(ModelSaveContext ctx)
_transform.Save(ctx);
}

public long? GetRowCount(bool lazy = true)
public long? GetRowCount()
{
return _transform.GetRowCount(lazy);
return _transform.GetRowCount();
}

public IRowCursor GetRowCursor(Func<int, bool> needCol, IRandom rand = null)
Expand Down
4 changes: 2 additions & 2 deletions src/Microsoft.ML.Data/Transforms/NopTransform.cs
Original file line number Diff line number Diff line change
Expand Up @@ -103,9 +103,9 @@ public bool CanShuffle

public Schema Schema => Source.Schema;

public long? GetRowCount(bool lazy = true)
public long? GetRowCount()
{
return Source.GetRowCount(lazy);
return Source.GetRowCount();
}

public IRowCursor GetRowCursor(Func<int, bool> predicate, IRandom rand = null)
Expand Down
4 changes: 2 additions & 2 deletions src/Microsoft.ML.Data/Transforms/PerGroupTransformBase.cs
Original file line number Diff line number Diff line change
Expand Up @@ -144,9 +144,9 @@ public virtual void Save(ModelSaveContext ctx)

protected abstract BindingsBase GetBindings();

public long? GetRowCount(bool lazy = true)
public long? GetRowCount()
{
return Source.GetRowCount(lazy);
return Source.GetRowCount();
}

public IRowCursor[] GetRowCursorSet(out IRowCursorConsolidator consolidator, Func<int, bool> predicate, int n, IRandom rand = null)
Expand Down
2 changes: 1 addition & 1 deletion src/Microsoft.ML.Data/Transforms/SelectColumnsTransform.cs
Original file line number Diff line number Diff line change
Expand Up @@ -628,7 +628,7 @@ public SelectColumnsDataTransform(IHostEnvironment env, SelectColumnsTransform t

Schema ISchematized.Schema => _mapper.Schema;

public long? GetRowCount(bool lazy = true) => Source.GetRowCount(lazy);
public long? GetRowCount() => Source.GetRowCount();

public IRowCursor GetRowCursor(Func<int, bool> needCol, IRandom rand = null)
{
Expand Down
4 changes: 2 additions & 2 deletions src/Microsoft.ML.Data/Transforms/SkipTakeFilter.cs
Original file line number Diff line number Diff line change
Expand Up @@ -169,11 +169,11 @@ public override void Save(ModelSaveContext ctx)
/// Returns the computed count of rows remaining after skip and take operation.
/// Returns null if count is unknown.
/// </summary>
public override long? GetRowCount(bool lazy = true)
public override long? GetRowCount()
{
if (_take == 0)
return 0;
long? count = Source.GetRowCount(lazy);
long? count = Source.GetRowCount();
if (count == null)
return null;

Expand Down
4 changes: 2 additions & 2 deletions src/Microsoft.ML.Data/Transforms/TermTransform.cs
Original file line number Diff line number Diff line change
Expand Up @@ -507,7 +507,7 @@ private static TermMap CreateFileTermMap(IHostEnvironment env, IChannel ch, stri
{
var header = new ProgressHeader(new[] { "Total Terms" }, new[] { "examples" });
var trainer = Trainer.Create(cursor, colSrc, autoConvert, int.MaxValue, bldr);
double rowCount = termData.GetRowCount(true) ?? double.NaN;
double rowCount = termData.GetRowCount() ?? double.NaN;
long rowCur = 0;
pch.SetHeader(header,
e =>
Expand Down Expand Up @@ -606,7 +606,7 @@ private static TermMap[] Train(IHostEnvironment env, IChannel ch, ColInfo[] info
using (var pch = env.StartProgressChannel("Building term dictionary"))
{
long rowCur = 0;
double rowCount = trainingData.GetRowCount(true) ?? double.NaN;
double rowCount = trainingData.GetRowCount() ?? double.NaN;
var header = new ProgressHeader(new[] { "Total Terms" }, new[] { "examples" });

itrainer = 0;
Expand Down
6 changes: 3 additions & 3 deletions src/Microsoft.ML.Data/Transforms/TransformBase.cs
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@ protected TransformBase(IHost host, IDataView input)

public abstract void Save(ModelSaveContext ctx);

public abstract long? GetRowCount(bool lazy = true);
public abstract long? GetRowCount();

public virtual bool CanShuffle { get { return Source.CanShuffle; } }

Expand Down Expand Up @@ -104,7 +104,7 @@ protected RowToRowTransformBase(IHost host, IDataView input)
{
}

public sealed override long? GetRowCount(bool lazy = true) { return Source.GetRowCount(lazy); }
public sealed override long? GetRowCount() { return Source.GetRowCount(); }
}

/// <summary>
Expand All @@ -124,7 +124,7 @@ private protected FilterBase(IHost host, IDataView input)
{
}

public override long? GetRowCount(bool lazy = true) => null;
public override long? GetRowCount() => null;

public sealed override Schema Schema => Source.Schema;

Expand Down
2 changes: 1 addition & 1 deletion src/Microsoft.ML.FastTree/FastTree.cs
Original file line number Diff line number Diff line change
Expand Up @@ -1862,7 +1862,7 @@ private void MakeBoundariesAndCheckLabels(out long missingInstances, out long to
ch.Info("Changing data from row-wise to column-wise");

long pos = 0;
double rowCountDbl = (double?)_data.Data.GetRowCount(lazy: true) ?? Double.NaN;
double rowCountDbl = (double?)_data.Data.GetRowCount() ?? Double.NaN;
pch.SetHeader(new ProgressHeader("examples"),
e => e.SetProgress(0, pos, rowCountDbl));
// REVIEW: Should we ignore rows with bad label, weight, or group? The previous code seemed to let
Expand Down
2 changes: 1 addition & 1 deletion src/Microsoft.ML.Parquet/ParquetLoader.cs
Original file line number Diff line number Diff line change
Expand Up @@ -384,7 +384,7 @@ private static Stream OpenStream(string filename)

public Schema Schema { get; }

public long? GetRowCount(bool lazy = true)
public long? GetRowCount()
{
return _rowCount;
}
Expand Down
Loading

0 comments on commit 8a45f37

Please sign in to comment.