Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
115 changes: 43 additions & 72 deletions gptables/core/wrappers.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
import re
import warnings
from copy import deepcopy
from math import ceil

import numpy as np
import pandas as pd
from xlsxwriter.utility import cell_autofit_width
from xlsxwriter.workbook import Workbook
from xlsxwriter.worksheet import Worksheet

Expand Down Expand Up @@ -850,12 +852,12 @@ def _set_column_widths(self, widths):
Set the column widths using a list of widths.
"""
for col_number in range(len(widths)):
self.set_column(col_number, col_number, widths[col_number])
self.set_column_pixels(col_number, col_number, widths[col_number])

def _calculate_column_widths(self, table, formats_table):
"""
Calculate Excel column widths using maximum length of strings
and the maximum font size in each column of the data table.
Calculate Excel column widths using xlsxwriter's cell_autofit_width for each cell,
and take the maximum per column.

Parameters
----------
Expand All @@ -870,87 +872,56 @@ def _calculate_column_widths(self, table, formats_table):
width to apply to Excel columns
"""
cols = table.shape[1]
max_lengths = [
table.iloc[:, col].apply(self._longest_line_length).max()
for col in range(cols)
]

max_font_sizes = [
formats_table.iloc[:, col].apply(lambda x: x.get("font_size") or 10).max()
for col in range(cols)
]

col_widths = [
self._excel_string_width(leng, f)
for leng, f in zip(max_lengths, max_font_sizes)
]
col_widths = []
for col in range(cols):
cell_widths = []
for row in range(table.shape[0]):
cell_val = table.iloc[row, col]
longest_line = self._get_longest_line(cell_val)
format_dict = formats_table.iloc[row, col]
scaling_factor = self._get_scaling_factor(format_dict, longest_line)
width = ceil(cell_autofit_width(longest_line) * scaling_factor)
cell_widths.append(width)
col_widths.append(max(cell_widths) if cell_widths else 0)
return col_widths

@staticmethod
def _excel_string_width(string_len, font_size):
"""
Calculate the rough length of a string in Excel character units.
This crude estimate does not account for font name or other font format
(e.g. wrapping).

Parameters
----------
string_len : int
length of string to calculate width in Excel for
font_size : int
size of font
def _get_scaling_factor(self, format_dict, text):
"""Return scaling factor for width based on font size, bold formatting,
and capitalisation."""
font_size = (
format_dict.get("font_size", 11) if isinstance(format_dict, dict) else 11
)
bold = (
format_dict.get("bold", False) if isinstance(format_dict, dict) else False
)

Returns
-------
excel_width : float
width of equivalent string in Excel
"""
if string_len == 0:
excel_width = 0
if text and isinstance(text, str):
num_upper = sum(1 for c in text if c.isupper())
upper_ratio = num_upper / len(text) if len(text) > 0 else 0
else:
excel_width = string_len * ((font_size * 0.12) - 0.09)
upper_ratio = 0
capitalisation_factor = 1.0 + 0.15 * upper_ratio
return (font_size / 11) * (1.1 if bold else 1.0) * capitalisation_factor

return excel_width

def _longest_line_length(self, cell_val):
"""
Calculate the length of the longest line within a cell.
If the cell contains a string, the longest length between line breaks is returned.
If the cell contains a float or integer, the longest length is calculated from the cell_value cast to a string.
If the cell contains a link formatted as {display_text: link}, the longest length is calculated from the display text.
If the cell contains a list of strings, the length of the longest string in the list is returned.
Expects new lines to be marked with "\n", "\r\n" or new lines in multiline strings.

Parameters
----------
cell_val:
cell value

Returns
-------
max_length: int
the length of the longest line within the string
"""
split_strings = """
|\r\n|\n"""
def _get_longest_line(self, cell_val):
"""Return the longest line in a cell value split by newline."""
cell_val_str = self._get_cell_string(cell_val)
return max(cell_val_str.split("\n"), key=len)

def _get_cell_string(self, cell_val):
"""Return the contents from any cell value as a string."""
if isinstance(cell_val, str):
max_length = max([len(line) for line in re.split(split_strings, cell_val)])
return cell_val
elif isinstance(cell_val, (float, int)):
max_length = self._longest_line_length(str(cell_val))
return str(cell_val)
elif isinstance(cell_val, dict):
max_length = self._longest_line_length(list(cell_val)[0])
return "\n".join([self._get_cell_string(k) for k in cell_val.keys()])
elif isinstance(cell_val, FormatList):
max_length = self._longest_line_length(cell_val.string)
return self._get_cell_string(cell_val.string)
elif isinstance(cell_val, list):
if isinstance(cell_val[0], (dict, FormatList)):
max_length = self._longest_line_length(cell_val[0])
else:
max_length = max([len(line) for line in cell_val])
return "\n".join([self._get_cell_string(item) for item in cell_val])
else:
max_length = 0

return max_length
return str(cell_val) if cell_val else ""


class GPWorkbook(Workbook):
Expand Down
Binary file modified gptables/test/test_api/test_end_to_end.xlsx
Binary file not shown.
108 changes: 81 additions & 27 deletions gptables/test/test_wrappers.py
Original file line number Diff line number Diff line change
Expand Up @@ -446,47 +446,101 @@ def test__mark_data_as_worksheet_table(self, testbook, create_gptable_with_kwarg
assert got_heading_format.__dict__ == exp_heading_format.__dict__

@pytest.mark.parametrize(
"cell_val,exp_length",
"data,format,exp_width",
[
("string", 6),
(42, 2),
(3.14, 4),
({"gov.uk": "https://www.gov.uk"}, 6),
(FormatList(["Partially ", {"bold": True}, "bold", " string"]), 21),
(["string", "another string"], 14),
("string\nwith\nnewlines", 8),
(FormatList(["string\r\n", {"bold": True}, "bold string"]), 11),
(set(), 0),
# Single column, normal case
(["string", "longer string"], [{"font_size": 12}, {"font_size": 12}], [93]),
# Multiple columns
(
pd.DataFrame({"col1": ["a", "bb"], "col2": ["ccc", "dddd"]}),
pd.DataFrame(
{
"col1": [{"font_size": 11}, {"font_size": 12}],
"col2": [{"font_size": 10}, {"font_size": 14}],
}
),
[26, 50],
),
# Bold formatting
(
["bold", "bolder"],
[{"font_size": 11, "bold": True}, {"font_size": 12, "bold": True}],
[58],
),
# Multi-line cell
(
["short\nlongest\nmid", "tiny"],
[{"font_size": 11}, {"font_size": 11}],
[53],
),
# Empty string
(["", ""], [{"font_size": 11}, {"font_size": 11}], [0]),
# Number cell
([123, 4567], [{"font_size": 11}, {"font_size": 11}], [35]),
],
)
def test__longest_line_length(self, testbook, cell_val, exp_length):
got_length = testbook.ws._longest_line_length(cell_val)
def test__calculate_column_widths(self, testbook, data, format, exp_width):
if isinstance(data, pd.DataFrame):
table = data
table_format = format
else:
table = pd.DataFrame({"col": data})
table_format = pd.DataFrame({"col": format})

assert got_length == exp_length
got_width = testbook.ws._calculate_column_widths(table, table_format)
assert got_width == exp_width
assert all(isinstance(w, int) for w in got_width)

@pytest.mark.parametrize(
"data",
"format_dict,longest_line,expected",
[
["string", "longer string"],
["longer string", "longer string"],
["string\nstring\nstring", "longer string"],
({"font_size": 11, "bold": False}, "abc", 1.0),
({"font_size": 12, "bold": False}, "abc", 12 / 11),
({"font_size": 11, "bold": True}, "abc", 1.1),
({"font_size": 12, "bold": True}, "abc", (12 / 11) * 1.1),
({}, "abc", 1.0),
({"font_size": 11, "bold": False}, "ABC", 1.0 * (1 + 0.15 * 1)),
({"font_size": 11, "bold": False}, "AbC", 1.0 * (1 + 0.15 * (2 / 3))),
({"font_size": 11, "bold": True}, "ALLCAPS", 1.1 * (1 + 0.15 * 1)),
(
{"font_size": 12, "bold": True},
"MiXeD",
(12 / 11) * 1.1 * (1 + 0.15 * (3 / 5)),
),
({"font_size": 11, "bold": False}, "lower", 1.0),
],
)
def test__get_scaling_factor(self, testbook, format_dict, longest_line, expected):
got = testbook.ws._get_scaling_factor(format_dict, longest_line)
assert got == expected

@pytest.mark.parametrize(
"format",
"cell_val,expected",
[
[{"font_size": 12}, {"font_size": 12}],
[{"font_size": 10}, {"font_size": 12}],
("short\nlongest\nmid", "longest"),
("one line", "one line"),
(["a", "bb", "ccc"], "ccc"),
("a\nbb\nccc", "ccc"),
],
)
def test__calculate_column_widths(self, testbook, data, format):
table = pd.DataFrame({"col": data})
table_format = pd.DataFrame({"col": format})

got_width = testbook.ws._calculate_column_widths(table, table_format)
exp_width = [testbook.ws._excel_string_width(string_len=13, font_size=12)]
def test__get_longest_line(self, testbook, cell_val, expected):
got = testbook.ws._get_longest_line(cell_val)
assert got == expected

assert got_width == exp_width
@pytest.mark.parametrize(
"cell_val,expected",
[
("abc", "abc"),
(123, "123"),
(["a", "b", "c"], "a\nb\nc"),
({"x": 1, "y": 2}, "x\ny"),
(pd.Timestamp("2023-09-30 12:34:56"), "2023-09-30 12:34:56"),
],
)
def test__get_cell_string(self, testbook, cell_val, expected):
# Patch FormatList handling if needed
got = testbook.ws._get_cell_string(cell_val)
assert got == expected


class TestGPWorkbookStatic:
Expand Down