Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

use dict to cache eltype names #2750

Merged
merged 8 commits into from
May 7, 2021
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
95 changes: 92 additions & 3 deletions src/abstractdataframe/io.jl
Original file line number Diff line number Diff line change
@@ -1,3 +1,91 @@
"""
    DataFrames.getmaxwidths(df::AbstractDataFrame,
                            io::IO,
                            rowindices1::AbstractVector{Int},
                            rowindices2::AbstractVector{Int},
                            rowlabel::Symbol,
                            rowid::Union{Integer, Nothing},
                            show_eltype::Bool,
                            buffer::IOBuffer,
                            truncstring::Int)

Calculate, for each column of an `AbstractDataFrame`, the maximum
string width used to render the name of that column, its type, and the
longest entry in that column -- among the rows of the data frame that
will be rendered to IO. The widths for all columns are returned as a
vector.

Return a `Vector{Int}` giving the maximum string widths required to render
each column, including that column's name and type.

NOTE: The last entry of the result vector is the string width of the
implicit row ID column contained in every `AbstractDataFrame`.

# Arguments
- `df::AbstractDataFrame`: The data frame whose columns will be printed.
- `io::IO`: The `IO` to which `df` is to be printed.
- `rowindices1::AbstractVector{Int}`: A set of indices of the first
  chunk of the `AbstractDataFrame` that would be rendered to IO.
- `rowindices2::AbstractVector{Int}`: A set of indices of the second
  chunk of the `AbstractDataFrame` that would be rendered to IO. Can
  be empty if the `AbstractDataFrame` would be printed without any
  ellipses.
- `rowlabel::Symbol`: The label that will be used when rendering the
  numeric IDs of each row. Typically, this will be set to `:Row`.
- `rowid`: Used to handle showing `DataFrameRow`.
- `show_eltype`: Whether to print the column type
  under the column name in the heading.
- `buffer`: buffer passed around to avoid reallocations in `ourstrwidth`.
- `truncstring`: value forwarded to `ourstrwidth` when measuring cell values
  and the `#undef` placeholder. Column names, element type names and the
  row label are measured with `0` instead, i.e. they are not truncated.
"""
function getmaxwidths(df::AbstractDataFrame,
                      io::IO,
                      rowindices1::AbstractVector{Int},
                      rowindices2::AbstractVector{Int},
                      rowlabel::Symbol,
                      rowid::Union{Integer, Nothing},
                      show_eltype::Bool,
                      buffer::IOBuffer,
                      truncstring::Int)
    # One slot per column plus a trailing slot for the row ID column.
    widths = Vector{Int}(undef, size(df, 2) + 1)

    # Width used for every cell that is not assigned.
    undef_width = ourstrwidth(io, "#undef", buffer, truncstring)

    # Element type names are computed in a single batch, and only when shown.
    eltype_strs = if show_eltype
        batch_compacttype(Any[eltype(c) for c in eachcol(df)])
    else
        String[]
    end

    for (k, (colname, column)) in enumerate(pairs(eachcol(df)))
        # (1) Start from the column name, which is never truncated.
        w = ourstrwidth(io, colname, buffer, 0)

        # (2) Widen to fit the longest cell among the rows that will be rendered.
        for rows in (rowindices1, rowindices2)
            for r in rows
                cellwidth = if isassigned(column, r)
                    ourstrwidth(io, column[r], buffer, truncstring)
                else
                    undef_width
                end
                w = max(w, cellwidth)
            end
        end

        # (3) Widen to fit the element type name, which is never truncated.
        if show_eltype
            w = max(w, ourstrwidth(io, eltype_strs[k], buffer, 0))
        end
        widths[k] = w
    end

    # Last slot: the row ID column. The row label is never truncated.
    if rowid === nothing
        digits1 = isempty(rowindices1) ? 0 : ndigits(maximum(rowindices1))
        digits2 = isempty(rowindices2) ? 0 : ndigits(maximum(rowindices2))
        widths[end] = max(max(digits1, digits2),
                          ourstrwidth(io, rowlabel, buffer, 0))
    else
        widths[end] = max(ndigits(rowid), ourstrwidth(io, rowlabel, buffer, 0))
    end

    return widths
end

"""
show(io::IO, mime::MIME, df::AbstractDataFrame)

Expand Down Expand Up @@ -107,8 +195,9 @@ function _show(io::IO, ::MIME"text/html", df::AbstractDataFrame;
if eltypes
write(io, "<tr>")
write(io, "<th></th>")
ct = batch_compacttype(Any[eltype(df[!, idx]) for idx in 1:mxcol])
for j in 1:mxcol
s = html_escape(compacttype(eltype(df[!, j])))
s = html_escape(ct[j])
write(io, "<th>$s</th>")
end
write(io, "</tr>")
Expand Down Expand Up @@ -281,8 +370,8 @@ function _show(io::IO, ::MIME"text/latex", df::AbstractDataFrame;
write(io, "\t\\hline\n")
if eltypes
write(io, "\t& ")
header = join(map(c -> latex_escape(string(compacttype(c))),
eltype.(eachcol(df)[1:mxcol])), " & ")
ct = batch_compacttype(Any[eltype(df[!, idx]) for idx in 1:mxcol])
header = join(latex_escape.(ct), " & ")
write(io, header)
mxcol < size(df, 2) && write(io, " & ")
write(io, "\\\\\n")
Expand Down
136 changes: 38 additions & 98 deletions src/abstractdataframe/show.jl
Original file line number Diff line number Diff line change
Expand Up @@ -68,8 +68,39 @@ if VERSION < v"1.5.0-DEV.261" || VERSION < v"1.5.0-DEV.266"
end
end

"""Return compact string representation of type T"""
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Add a comment to explain what's the point of having this function?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

added

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't see it. :-D

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, added. I was looking at another view on GitHub and thought you wanted me to expand the docstring of `compacttype` (which I did).

function compacttype(T::Type, maxwidth::Int=8, initial::Bool=true)
# For most data frames, especially wide, columns having the same element type
# occur multiple times. batch_compacttype ensures that we compute string
# representation of a specific column element type only once and then reuse it.

# Return a vector holding, for each position `i`, the string produced by
# `compacttype(types[i], maxwidths[i])`.
#
# `types` and `maxwidths` must have the same length (checked with `@assert`,
# as this is an internal helper).
#
# The string returned by `compacttype` depends on the requested width as well
# as on the type, so the cache is keyed on the `(type, maxwidth)` pair. Keying
# on the type alone would make columns that share an element type but have
# different widths reuse whichever string happened to be computed first.
function batch_compacttype(types::Vector{Any}, maxwidths::Vector{Int})
    @assert length(types) == length(maxwidths)
    cache = Dict{Tuple{Any, Int}, String}()
    return map(types, maxwidths) do T, maxwidth
        get!(cache, (T, maxwidth)) do
            compacttype(T, maxwidth)
        end
    end
end

# Single-width variant: return `compacttype(T, maxwidth)` for every `T` in
# `types`, computing the string only once per distinct type.
function batch_compacttype(types::Vector{Any}, maxwidth::Int=8)
    seen = Dict{Type, String}()
    # Look the type up in the cache, filling the entry on first use.
    lookup(T) = get!(() -> compacttype(T, maxwidth), seen, T)
    return map(lookup, types)
end

"""
compacttype(T::Type, maxwidth::Int=8, initial::Bool=true)

Return compact string representation of type `T`.

For displaying data frame we do not want string representation of type to be
longer than `maxwidth`. This function implements rules how type names are
cropped if they are longer than `maxwidth`.
"""
function compacttype(T::Type, maxwidth::Int=8)
maxwidth = max(8, maxwidth)

T === Any && return "Any"
Expand All @@ -82,8 +113,7 @@ function compacttype(T::Type, maxwidth::Int=8, initial::Bool=true)
T = nonmissingtype(T)
sT = string(T)
suffix = "?"
# ignore "?" for initial width counting but respect it for display
initial || (maxwidth -= 1)
maxwidth -= 1
textwidth(sT) ≤ maxwidth && return sT * suffix
else
suffix = ""
Expand Down Expand Up @@ -119,93 +149,6 @@ function compacttype(T::Type, maxwidth::Int=8, initial::Bool=true)
return first(sT, stop) * "…" * suffix
end

"""
    DataFrames.getmaxwidths(df::AbstractDataFrame,
                            io::IO,
                            rowindices1::AbstractVector{Int},
                            rowindices2::AbstractVector{Int},
                            rowlabel::Symbol,
                            rowid::Union{Integer, Nothing},
                            show_eltype::Bool,
                            buffer::IOBuffer,
                            truncstring::Int)

Calculate, for each column of an `AbstractDataFrame`, the maximum
string width used to render the name of that column, its type, and the
longest entry in that column -- among the rows of the data frame that
will be rendered to IO. The widths for all columns are returned as a
vector.

Return a `Vector{Int}` giving the maximum string widths required to render
each column, including that column's name and type.

NOTE: The last entry of the result vector is the string width of the
implicit row ID column contained in every `AbstractDataFrame`.

# Arguments
- `df::AbstractDataFrame`: The data frame whose columns will be printed.
- `io::IO`: The `IO` to which `df` is to be printed.
- `rowindices1::AbstractVector{Int}`: A set of indices of the first
  chunk of the `AbstractDataFrame` that would be rendered to IO.
- `rowindices2::AbstractVector{Int}`: A set of indices of the second
  chunk of the `AbstractDataFrame` that would be rendered to IO. Can
  be empty if the `AbstractDataFrame` would be printed without any
  ellipses.
- `rowlabel::Symbol`: The label that will be used when rendering the
  numeric IDs of each row. Typically, this will be set to `:Row`.
- `rowid`: Used to handle showing `DataFrameRow`.
- `show_eltype`: Whether to print the column type
  under the column name in the heading.
- `buffer`: buffer passed around to avoid reallocations in `ourstrwidth`.
- `truncstring`: value forwarded to `ourstrwidth` when measuring cell values
  and the `#undef` placeholder. Column names, element type names and the
  row label are measured with `0` instead, i.e. they are not truncated.
"""
function getmaxwidths(df::AbstractDataFrame,
                      io::IO,
                      rowindices1::AbstractVector{Int},
                      rowindices2::AbstractVector{Int},
                      rowlabel::Symbol,
                      rowid::Union{Integer, Nothing},
                      show_eltype::Bool,
                      buffer::IOBuffer,
                      truncstring::Int)
    # One slot per column plus a trailing slot for the row ID column.
    widths = Vector{Int}(undef, size(df, 2) + 1)

    # Width used for every cell that is not assigned.
    undef_width = ourstrwidth(io, "#undef", buffer, truncstring)

    for (k, (colname, column)) in enumerate(pairs(eachcol(df)))
        # (1) Start from the column name, which is never truncated.
        w = ourstrwidth(io, colname, buffer, 0)

        # (2) Widen to fit the longest cell among the rows that will be rendered.
        for rows in (rowindices1, rowindices2)
            for r in rows
                cellwidth = if isassigned(column, r)
                    ourstrwidth(io, column[r], buffer, truncstring)
                else
                    undef_width
                end
                w = max(w, cellwidth)
            end
        end

        # (3) Widen to fit the element type name, which is never truncated.
        if show_eltype
            w = max(w, ourstrwidth(io, compacttype(eltype(column)), buffer, 0))
        end
        widths[k] = w
    end

    # Last slot: the row ID column. The row label is never truncated.
    if rowid === nothing
        digits1 = isempty(rowindices1) ? 0 : ndigits(maximum(rowindices1))
        digits2 = isempty(rowindices2) ? 0 : ndigits(maximum(rowindices2))
        widths[end] = max(max(digits1, digits2),
                          ourstrwidth(io, rowlabel, buffer, 0))
    else
        widths[end] = max(ndigits(rowid), ourstrwidth(io, rowlabel, buffer, 0))
    end

    return widths
end

function _show(io::IO,
df::AbstractDataFrame;
allrows::Bool = !get(io, :limit, false),
Expand All @@ -220,13 +163,10 @@ function _show(io::IO,
_check_consistency(df)

names_str = names(df)
names_len = textwidth.(names_str)
maxwidth = max.(9, names_len)
types = eltype.(eachcol(df))

# NOTE: If we reuse `types` here, the time to print the first table is 2x
# more. This should be something related to type inference.
types_str = compacttype.(eltype.(eachcol(df)), maxwidth)
names_len = Int[textwidth(n) for n in names_str]
maxwidth = Int[max(9, nl) for nl in names_len]
types = Any[eltype(c) for c in eachcol(df)]
types_str = batch_compacttype(types, maxwidth)

if allcols && allrows
crop = :none
Expand Down