Performance regression since 10.0.0 #3674

Closed
@breyerml

Description

We saw a performance regression of a factor of 2-3 when switching from fmt version 8.1.0 to 10.0.0 (also happens with 10.1.1).

I extracted an MWE from our code, consisting of a CMakeLists.txt and a main.cpp.

CMakeLists.txt:

cmake_minimum_required (VERSION 3.20)
project (Test)

set(CMAKE_CXX_STANDARD 17)

add_executable(prog main.cpp)

include(FetchContent)
FetchContent_Declare(fmt
        GIT_REPOSITORY https://github.com/fmtlib/fmt.git
        GIT_TAG 8.1.0
        GIT_SHALLOW TRUE
        QUIET)

set(FMT_TEST OFF CACHE BOOL "" FORCE)
set(FMT_DOC OFF CACHE BOOL "" FORCE)
set(FMT_INSTALL OFF CACHE BOOL "" FORCE)
set(FMT_SYSTEM_HEADERS ON CACHE BOOL "" FORCE)

FetchContent_MakeAvailable(fmt)
set_property(TARGET fmt PROPERTY POSITION_INDEPENDENT_CODE ON)
add_dependencies(prog fmt)
target_include_directories(prog PUBLIC ${fmt_SOURCE_DIR}/include)
target_link_libraries(prog PUBLIC fmt::fmt)

find_package(OpenMP REQUIRED)
target_link_libraries(prog PUBLIC OpenMP::OpenMP_CXX)
#include "fmt/compile.h"
#include "fmt/format.h"
#include "fmt/chrono.h"
#include "fmt/os.h"

#include "omp.h"

#include <vector>
#include <random>
#include <string>

using real_type = double;

template <typename T>
using matrix = std::vector<std::vector<T>>;

template <typename label_type>
inline void write(const std::string &filename, const std::vector<label_type> &labels, const matrix<real_type> &support_vectors) {
    
    const std::size_t num_features = support_vectors.front().size();
    const std::size_t num_classes = 2;
    const std::size_t num_alpha_per_point = num_classes;
    const std::vector<label_type> label_order = { 0, 1 };

    // create file
    fmt::ostream out = fmt::output_file(filename);
    // write timestamp for current date time
    // note: commented out since the resulting model file cannot be read by LIBSVM
    //    out.print("# This model file has been created at {}\n", detail::current_date_time());

    // the maximum size of one formatted LIBSVM entry, e.g., 1234:1.365363e+10
    // biggest number representable as std::size_t: 18446744073709551615 -> 20 chars
    // scientific notation: 3 chars (number in front of decimal separator including a sign + decimal separator) + 10 chars (part after the decimal separator, specified during formatting) +
    //                      5 chars for the exponent (e + sign + maximum potential exponent (308 -> 3 digits))
    // separators: 2 chars (: between index and feature + whitespace after feature value)
    // -> 40 chars in total
    // -> increased to 48 chars to be on the safe side
    static constexpr std::size_t CHARS_PER_BLOCK = 48;
    // results in a 48 B * 128 = 6 KiB stack buffer per thread
    static constexpr std::size_t BLOCK_SIZE = 128;
    // use 1 MiB as buffer per thread
    constexpr std::size_t STRING_BUFFER_SIZE = 1024 * 1024;

    // format one output-line
    auto format_libsvm_line = [](std::string &output, const std::vector<real_type> &a, const matrix<real_type> &d, const std::size_t point) {
        static constexpr std::size_t STACK_BUFFER_SIZE = BLOCK_SIZE * CHARS_PER_BLOCK;
        static char buffer[STACK_BUFFER_SIZE];
        #pragma omp threadprivate(buffer)

        output.append(fmt::format("{:.10e} ", fmt::join(a, " ")));
        for (typename std::vector<real_type>::size_type j = 0; j < d.front().size(); j += BLOCK_SIZE) {
            char *ptr = buffer;
            for (std::size_t i = 0; i < std::min<std::size_t>(BLOCK_SIZE, d.front().size() - j); ++i) {
                if (d[point][j + i] != real_type{ 0.0 }) {
                    // add 1 to the index since LIBSVM assumes 1-based feature indexing
                    ptr = fmt::format_to(ptr, FMT_COMPILE("{}:{:.10e} "), j + i + 1, d[point][j + i]);
                }
            }
            output.append(buffer, ptr - buffer);
        }
        output.push_back('\n');
    };

    // volatile counters used to enforce the class output order across threads;
    // counts[0] is set to INT_MAX so that no thread waits before the first class
    auto counts = std::make_unique<volatile int[]>(label_order.size() + 1);
    counts[0] = std::numeric_limits<int>::max();
    #pragma omp parallel default(none) shared(counts, format_libsvm_line, label_order, labels, support_vectors, out) firstprivate(BLOCK_SIZE, CHARS_PER_BLOCK, num_features, num_classes, num_alpha_per_point)
    {
        // preallocate string buffer, only ONE allocation
        std::string out_string;
        out_string.reserve(STRING_BUFFER_SIZE + (num_features + num_alpha_per_point) * CHARS_PER_BLOCK);  // oversize the buffer so that at least one additional line fits into it
        std::vector<real_type> alpha_per_point(num_alpha_per_point);

        // loop over all classes, since the support vectors must be written grouped by class
        for (typename std::vector<label_type>::size_type l = 0; l < label_order.size(); ++l) {
            // write the support vectors belonging to the l-th class
            #pragma omp for nowait
            for (typename std::vector<real_type>::size_type i = 0; i < support_vectors.size(); ++i) {
                if (labels[i] == label_order[l]) {
                    format_libsvm_line(out_string, alpha_per_point, support_vectors, i);

                    // if the buffer is full, write it to the file
                    if (out_string.size() > STRING_BUFFER_SIZE) {
                        // wait for all threads to write support vectors for previous class
#ifdef _OPENMP
                        while (counts[l] < omp_get_num_threads()) {
                        }
#endif
                        #pragma omp critical
                        {
                            out.print("{}", out_string);
                        }
                        // clear buffer
                        out_string.clear();
                    }
                }
            }
            // wait for all threads to write support vectors for previous class
#ifdef _OPENMP
            while (counts[l] < omp_get_num_threads()) {
            }
#endif

            #pragma omp critical
            {
                if (!out_string.empty()) {
                    out.print("{}", out_string);
                    out_string.clear();
                }
                counts[l + 1] = counts[l + 1] + 1;
            }
        }
    }
}



int main() {

  std::random_device rd;
  std::mt19937 mt(rd());
  std::uniform_real_distribution<double> dist(-1.0, 1.0);

  std::vector<int> labels(73257);
  matrix<real_type> support_vectors(73257, std::vector<real_type>(3072));

  for (std::size_t i = 0; i < labels.size(); ++i) {
    labels[i] = i % 2;
    for (std::size_t j = 0; j < support_vectors.front().size(); ++j) {
      support_vectors[i][j] = dist(mt);
    }
  }


  auto start = std::chrono::steady_clock::now();
  write("test.libsvm", labels, support_vectors);
  auto end = std::chrono::steady_clock::now();
  fmt::print("{}\n", std::chrono::duration_cast<std::chrono::milliseconds>(end - start));

  return 0;
}

I compiled the above code with g++ 11.4.0 in Release mode: cmake -DCMAKE_BUILD_TYPE=Release .. && make

On our machine (AMD EPYC 7763 64-Core Processor) I get the following runtimes for the MWE writing to an SSD:
8.1.0: 4067 ms
9.1.0: 4093 ms
10.0.0: 12954 ms
10.1.1: 13009 ms
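
In case it helps to narrow this down: the hot path of the MWE is the per-element fmt::format_to(ptr, FMT_COMPILE("{}:{:.10e} "), ...) call in format_libsvm_line. The following standalone sketch (a reduction of just that call, without the file I/O and OpenMP parts of the MWE; the value count and buffer size are chosen arbitrarily) times only the compiled double formatting, which should show whether that part alone accounts for the difference between the versions:

// Standalone micro-benchmark: times only the compiled "{}:{:.10e} "
// formatting from format_libsvm_line, without file I/O or OpenMP.
#include "fmt/compile.h"
#include "fmt/format.h"

#include <chrono>
#include <cstddef>
#include <random>
#include <vector>

int main() {
    std::mt19937 mt(42);
    std::uniform_real_distribution<double> dist(-1.0, 1.0);

    // arbitrary number of values; the generation is outside the timed loop
    std::vector<double> values(10'000'000);
    for (double &v : values) {
        v = dist(mt);
    }

    char buffer[64];  // large enough for one formatted "index:value " entry
    std::size_t total_bytes = 0;

    const auto start = std::chrono::steady_clock::now();
    for (std::size_t i = 0; i < values.size(); ++i) {
        // same compiled format string as in the MWE's hot loop
        char *out_end = fmt::format_to(buffer, FMT_COMPILE("{}:{:.10e} "), i + 1, values[i]);
        total_bytes += static_cast<std::size_t>(out_end - buffer);
    }
    const auto end = std::chrono::steady_clock::now();

    fmt::print("{} bytes formatted in {} ms\n", total_bytes,
               std::chrono::duration_cast<std::chrono::milliseconds>(end - start).count());
    return 0;
}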

Am I missing something, or should there not be such a loss in performance simply from using a newer {fmt} version?
