ONSdigital · Sara-Jade-O · Oct 13, 2025 · Sep 23, 2025 · Sep 23, 2025 · Sep 23, 2025
@@ -1,9 +1,11 @@
 import re
 import warnings
 from copy import deepcopy
+from math import ceil
 
 import numpy as np
 import pandas as pd
+from xlsxwriter.utility import cell_autofit_width
 from xlsxwriter.workbook import Workbook
 from xlsxwriter.worksheet import Worksheet
 
@@ -850,12 +852,12 @@ def _set_column_widths(self, widths):
         Set the column widths using a list of widths.
         """
         for col_number in range(len(widths)):
-            self.set_column(col_number, col_number, widths[col_number])
+            self.set_column_pixels(col_number, col_number, widths[col_number])
 
     def _calculate_column_widths(self, table, formats_table):
         """
-        Calculate Excel column widths using maximum length of strings
-        and the maximum font size in each column of the data table.
+        Calculate Excel column widths using xlsxwriter's cell_autofit_width for each cell,
+        and take the maximum per column.
 
         Parameters
         ----------
@@ -870,87 +872,56 @@ def _calculate_column_widths(self, table, formats_table):
             width to apply to Excel columns
         """
         cols = table.shape[1]
-        max_lengths = [
-            table.iloc[:, col].apply(self._longest_line_length).max()
-            for col in range(cols)
-        ]
-
-        max_font_sizes = [
-            formats_table.iloc[:, col].apply(lambda x: x.get("font_size") or 10).max()
-            for col in range(cols)
-        ]
-
-        col_widths = [
-            self._excel_string_width(leng, f)
-            for leng, f in zip(max_lengths, max_font_sizes)
-        ]
+        col_widths = []
+        for col in range(cols):
+            cell_widths = []
+            for row in range(table.shape[0]):
+                cell_val = table.iloc[row, col]
+                longest_line = self._get_longest_line(cell_val)
+                format_dict = formats_table.iloc[row, col]
+                scaling_factor = self._get_scaling_factor(format_dict, longest_line)
+                width = ceil(cell_autofit_width(longest_line) * scaling_factor)
+                cell_widths.append(width)
+            col_widths.append(max(cell_widths) if cell_widths else 0)
         return col_widths
 
-    @staticmethod
-    def _excel_string_width(string_len, font_size):
-        """
-        Calculate the rough length of a string in Excel character units.
-        This crude estimate does not account for font name or other font format
-        (e.g. wrapping).
-
-        Parameters
-        ----------
-        string_len : int
-            length of string to calculate width in Excel for
-        font_size : int
-            size of font
+    def _get_scaling_factor(self, format_dict, text):
+        """Return the width multiplier for `text`: (font_size / 11), times 1.1
+        if bold, times up to 1.15 in proportion to the share of capitals."""
+        is_format = isinstance(format_dict, dict)
+        # `or 11` (rather than a .get default) also treats an explicit
+        # "font_size": None entry as unset instead of dividing None by 11.
+        font_size = (format_dict.get("font_size") or 11) if is_format else 11
+        # Only the truthiness of "bold" matters to the multiplier below.
+        bold = bool(format_dict.get("bold")) if is_format else False
 
-        Returns
-        -------
-        excel_width : float
-            width of equivalent string in Excel
-        """
-        if string_len == 0:
-            excel_width = 0
+        if text and isinstance(text, str):
+            # text is non-empty in this branch, so the division is safe.
+            upper_ratio = sum(1 for c in text if c.isupper()) / len(text)
         else:
-            excel_width = string_len * ((font_size * 0.12) - 0.09)
+            upper_ratio = 0
+        capitalisation_factor = 1.0 + 0.15 * upper_ratio
+        return (font_size / 11) * (1.1 if bold else 1.0) * capitalisation_factor
 
-        return excel_width
-
-    def _longest_line_length(self, cell_val):
-        """
-        Calculate the length of the longest line within a cell.
-        If the cell contains a string, the longest length between line breaks is returned.
-        If the cell contains a float or integer, the longest length is calculated from the cell_value cast to a string.
-        If the cell contains a link formatted as {display_text: link}, the longest length is calculated from the display text.
-        If the cell contains a list of strings, the length of the longest string in the list is returned.
-        Expects new lines to be marked with "\n", "\r\n" or new lines in multiline strings.
-
-        Parameters
-        ----------
-        cell_val:
-            cell value
-
-        Returns
-        -------
-        max_length: int
-            the length of the longest line within the string
-        """
-        split_strings = """
-|\r\n|\n"""
+    def _get_longest_line(self, cell_val):
+        """Return the longest line of a cell value, split on any line break."""
+        lines = self._get_cell_string(cell_val).splitlines()
+        return max(lines, key=len) if lines else ""
 
+    def _get_cell_string(self, cell_val):
+        """Return a cell value as text, with dict keys and list items newline-joined."""
         if isinstance(cell_val, str):
-            max_length = max([len(line) for line in re.split(split_strings, cell_val)])
+            return cell_val
         elif isinstance(cell_val, (float, int)):
-            max_length = self._longest_line_length(str(cell_val))
+            return str(cell_val)
         elif isinstance(cell_val, dict):
-            max_length = self._longest_line_length(list(cell_val)[0])
+            return "\n".join([self._get_cell_string(k) for k in cell_val.keys()])
         elif isinstance(cell_val, FormatList):
-            max_length = self._longest_line_length(cell_val.string)
+            return self._get_cell_string(cell_val.string)
         elif isinstance(cell_val, list):
-            if isinstance(cell_val[0], (dict, FormatList)):
-                max_length = self._longest_line_length(cell_val[0])
-            else:
-                max_length = max([len(line) for line in cell_val])
+            return "\n".join([self._get_cell_string(item) for item in cell_val])
         else:
-            max_length = 0
-
-        return max_length
+            return str(cell_val) if cell_val else ""
 
 
 class GPWorkbook(Workbook):

@@ -446,47 +446,101 @@ def test__mark_data_as_worksheet_table(self, testbook, create_gptable_with_kwarg
             assert got_heading_format.__dict__ == exp_heading_format.__dict__
 
     @pytest.mark.parametrize(
-        "cell_val,exp_length",
+        "data,format,exp_width",
         [
-            ("string", 6),
-            (42, 2),
-            (3.14, 4),
-            ({"gov.uk": "https://www.gov.uk"}, 6),
-            (FormatList(["Partially ", {"bold": True}, "bold", " string"]), 21),
-            (["string", "another string"], 14),
-            ("string\nwith\nnewlines", 8),
-            (FormatList(["string\r\n", {"bold": True}, "bold string"]), 11),
-            (set(), 0),
+            # Single column, normal case
+            (["string", "longer string"], [{"font_size": 12}, {"font_size": 12}], [93]),
+            # Multiple columns
+            (
+                pd.DataFrame({"col1": ["a", "bb"], "col2": ["ccc", "dddd"]}),
+                pd.DataFrame(
+                    {
+                        "col1": [{"font_size": 11}, {"font_size": 12}],
+                        "col2": [{"font_size": 10}, {"font_size": 14}],
+                    }
+                ),
+                [26, 50],
+            ),
+            # Bold formatting
+            (
+                ["bold", "bolder"],
+                [{"font_size": 11, "bold": True}, {"font_size": 12, "bold": True}],
+                [58],
+            ),
+            # Multi-line cell
+            (
+                ["short\nlongest\nmid", "tiny"],
+                [{"font_size": 11}, {"font_size": 11}],
+                [53],
+            ),
+            # Empty string
+            (["", ""], [{"font_size": 11}, {"font_size": 11}], [0]),
+            # Number cell
+            ([123, 4567], [{"font_size": 11}, {"font_size": 11}], [35]),
         ],
     )
-    def test__longest_line_length(self, testbook, cell_val, exp_length):
-        got_length = testbook.ws._longest_line_length(cell_val)
+    def test__calculate_column_widths(self, testbook, data, format, exp_width):
+        if isinstance(data, pd.DataFrame):
+            table = data
+            table_format = format
+        else:
+            table = pd.DataFrame({"col": data})
+            table_format = pd.DataFrame({"col": format})
 
-        assert got_length == exp_length
+        got_width = testbook.ws._calculate_column_widths(table, table_format)
+        assert got_width == exp_width
+        assert all(isinstance(w, int) for w in got_width)
 
     @pytest.mark.parametrize(
-        "data",
+        "format_dict,longest_line,expected",
         [
-            ["string", "longer string"],
-            ["longer string", "longer string"],
-            ["string\nstring\nstring", "longer string"],
+            ({"font_size": 11, "bold": False}, "abc", 1.0),
+            ({"font_size": 12, "bold": False}, "abc", 12 / 11),
+            ({"font_size": 11, "bold": True}, "abc", 1.1),
+            ({"font_size": 12, "bold": True}, "abc", (12 / 11) * 1.1),
+            ({}, "abc", 1.0),
+            ({"font_size": 11, "bold": False}, "ABC", 1.0 * (1 + 0.15 * 1)),
+            ({"font_size": 11, "bold": False}, "AbC", 1.0 * (1 + 0.15 * (2 / 3))),
+            ({"font_size": 11, "bold": True}, "ALLCAPS", 1.1 * (1 + 0.15 * 1)),
+            (
+                {"font_size": 12, "bold": True},
+                "MiXeD",
+                (12 / 11) * 1.1 * (1 + 0.15 * (3 / 5)),
+            ),
+            ({"font_size": 11, "bold": False}, "lower", 1.0),
         ],
     )
+    def test__get_scaling_factor(self, testbook, format_dict, longest_line, expected):
+        got = testbook.ws._get_scaling_factor(format_dict, longest_line)
+        assert got == pytest.approx(expected)  # float products: avoid exact ==
+
     @pytest.mark.parametrize(
-        "format",
+        "cell_val,expected",
         [
-            [{"font_size": 12}, {"font_size": 12}],
-            [{"font_size": 10}, {"font_size": 12}],
+            ("short\nlongest\nmid", "longest"),
+            ("one line", "one line"),
+            (["a", "bb", "ccc"], "ccc"),
+            ("a\nbb\nccc", "ccc"),
         ],
     )
-    def test__calculate_column_widths(self, testbook, data, format):
-        table = pd.DataFrame({"col": data})
-        table_format = pd.DataFrame({"col": format})
-
-        got_width = testbook.ws._calculate_column_widths(table, table_format)
-        exp_width = [testbook.ws._excel_string_width(string_len=13, font_size=12)]
+    def test__get_longest_line(self, testbook, cell_val, expected):
+        got = testbook.ws._get_longest_line(cell_val)
+        assert got == expected
 
-        assert got_width == exp_width
+    @pytest.mark.parametrize(
+        "cell_val,expected",
+        [
+            ("abc", "abc"),
+            (123, "123"),
+            (["a", "b", "c"], "a\nb\nc"),
+            ({"x": 1, "y": 2}, "x\ny"),
+            (pd.Timestamp("2023-09-30 12:34:56"), "2023-09-30 12:34:56"),
+        ],
+    )
+    def test__get_cell_string(self, testbook, cell_val, expected):
+        # Dicts contribute their keys; other types (e.g. Timestamp) use str().
+        got = testbook.ws._get_cell_string(cell_val)
+        assert got == expected
 
 
 class TestGPWorkbookStatic: