Skip to content

Unittests and usability for BitstreamWriter incremental flushing #92983

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 10 commits into from
May 30, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions llvm/include/llvm/Bitcode/BitcodeWriter.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,6 @@ class Module;
class raw_ostream;

class BitcodeWriter {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ideally don't change the spacing in this PR, because it creates a lot of spurious diffs.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

(it's really hard for me to identify what are the actual change in this class)

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

fixed the formatting upstream, then merged.

SmallVectorImpl<char> &Buffer;
std::unique_ptr<BitstreamWriter> Stream;

StringTableBuilder StrtabBuilder{StringTableBuilder::RAW};
Expand All @@ -47,7 +46,8 @@ class BitcodeWriter {

public:
/// Create a BitcodeWriter that writes to Buffer.
BitcodeWriter(SmallVectorImpl<char> &Buffer, raw_fd_stream *FS = nullptr);
BitcodeWriter(SmallVectorImpl<char> &Buffer);
BitcodeWriter(raw_ostream &FS);

~BitcodeWriter();

Expand Down
167 changes: 120 additions & 47 deletions llvm/include/llvm/Bitstream/BitstreamWriter.h
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Bitstream/BitCodes.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Endian.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
Expand All @@ -28,34 +29,44 @@
namespace llvm {

class BitstreamWriter {
/// Out - The buffer that keeps unflushed bytes.
SmallVectorImpl<char> &Out;

/// FS - The file stream that Out flushes to. If FS is nullptr, it does not
/// support read or seek, Out cannot be flushed until all data are written.
raw_fd_stream *FS;

/// FlushThreshold - If FS is valid, this is the threshold (unit B) to flush
/// FS.
/// Owned buffer, used to init Buffer if the provided stream doesn't happen to
/// be a buffer itself.
SmallVector<char, 0> OwnBuffer;
/// Internal buffer for unflushed bytes (unless there is no stream to flush
/// to, case in which these are "the bytes"). The writer backpatches, so it is
/// efficient to buffer.
SmallVectorImpl<char> &Buffer;

/// FS - The file stream that Buffer flushes to. If FS is a raw_fd_stream, the
/// writer will incrementally flush at subblock boundaries. Otherwise flushing
/// will happen at the end of BitstreamWriter's lifetime.
raw_ostream *const FS;

/// FlushThreshold - this is the threshold (unit B) to flush to FS, if FS is a
/// raw_fd_stream.
const uint64_t FlushThreshold;

/// CurBit - Always between 0 and 31 inclusive, specifies the next bit to use.
unsigned CurBit;
unsigned CurBit = 0;

/// CurValue - The current value. Only bits < CurBit are valid.
uint32_t CurValue;
uint32_t CurValue = 0;

/// CurCodeSize - This is the declared size of code values used for the
/// current block, in bits.
unsigned CurCodeSize;
unsigned CurCodeSize = 2;

/// BlockInfoCurBID - When emitting a BLOCKINFO_BLOCK, this is the currently
/// selected BLOCK ID.
unsigned BlockInfoCurBID;
unsigned BlockInfoCurBID = 0;

/// CurAbbrevs - Abbrevs installed at in this block.
std::vector<std::shared_ptr<BitCodeAbbrev>> CurAbbrevs;

// Support for retrieving a section of the output, for purposes such as
// checksumming.
std::optional<size_t> BlockFlushingStartPos;

struct Block {
unsigned PrevCodeSize;
size_t StartSizeWord;
Expand All @@ -77,47 +88,106 @@ class BitstreamWriter {
void WriteWord(unsigned Value) {
Value =
support::endian::byte_swap<uint32_t, llvm::endianness::little>(Value);
Out.append(reinterpret_cast<const char *>(&Value),
reinterpret_cast<const char *>(&Value + 1));
Buffer.append(reinterpret_cast<const char *>(&Value),
reinterpret_cast<const char *>(&Value + 1));
}

uint64_t GetNumOfFlushedBytes() const { return FS ? FS->tell() : 0; }
uint64_t GetNumOfFlushedBytes() const {
return fdStream() ? fdStream()->tell() : 0;
}

size_t GetBufferOffset() const { return Out.size() + GetNumOfFlushedBytes(); }
size_t GetBufferOffset() const {
return Buffer.size() + GetNumOfFlushedBytes();
}

size_t GetWordIndex() const {
size_t Offset = GetBufferOffset();
assert((Offset & 3) == 0 && "Not 32-bit aligned");
return Offset / 4;
}

/// If the related file stream supports reading, seeking and writing, flush
/// the buffer if its size is above a threshold.
void FlushToFile() {
if (!FS)
void flushAndClear() {
assert(FS);
assert(!Buffer.empty());
assert(!BlockFlushingStartPos &&
"a call to markAndBlockFlushing should have been paired with a "
"call to getMarkedBufferAndResumeFlushing");
FS->write(Buffer.data(), Buffer.size());
Buffer.clear();
}

/// If the related file stream is a raw_fd_stream, flush the buffer if its
/// size is above a threshold. If \p OnClosing is true, flushing happens
/// regardless of thresholds.
void FlushToFile(bool OnClosing = false) {
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Suggest adding a brief comment about new parameter meaning/behavior.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

done

if (!FS || Buffer.empty())
return;
if (Out.size() < FlushThreshold)
if (OnClosing)
return flushAndClear();
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What prevents this from being called when FS==nullptr, which will cause an assert in flushAndClear()?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

the line above - we do nothing if FS == nullptr

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

woops, missed that

if (BlockFlushingStartPos)
return;
FS->write((char *)&Out.front(), Out.size());
Out.clear();
if (fdStream() && Buffer.size() > FlushThreshold)
flushAndClear();
}

raw_fd_stream *fdStream() { return dyn_cast_or_null<raw_fd_stream>(FS); }

const raw_fd_stream *fdStream() const {
return dyn_cast_or_null<raw_fd_stream>(FS);
}

SmallVectorImpl<char> &getInternalBufferFromStream(raw_ostream &OutStream) {
if (auto *SV = dyn_cast<raw_svector_ostream>(&OutStream))
return SV->buffer();
return OwnBuffer;
}

public:
/// Create a BitstreamWriter that writes to Buffer \p O.
/// Create a BitstreamWriter over a raw_ostream \p OutStream.
/// If \p OutStream is a raw_svector_ostream, the BitstreamWriter will write
/// directly to the latter's buffer. In all other cases, the BitstreamWriter
/// will use an internal buffer and flush at the end of its lifetime.
///
/// \p FS is the file stream that \p O flushes to incrementally. If \p FS is
/// null, \p O does not flush incrementially, but writes to disk at the end.
/// In addition, if \p is a raw_fd_stream supporting seek, tell, and read
/// (besides write), the BitstreamWriter will also flush incrementally, when a
/// subblock is finished, and if the FlushThreshold is passed.
///
/// \p FlushThreshold is the threshold (unit M) to flush \p O if \p FS is
/// valid. Flushing only occurs at (sub)block boundaries.
BitstreamWriter(SmallVectorImpl<char> &O, raw_fd_stream *FS = nullptr,
uint32_t FlushThreshold = 512)
: Out(O), FS(FS), FlushThreshold(uint64_t(FlushThreshold) << 20), CurBit(0),
CurValue(0), CurCodeSize(2) {}
/// NOTE: \p FlushThreshold's unit is MB.
BitstreamWriter(raw_ostream &OutStream, uint32_t FlushThreshold = 512)
: Buffer(getInternalBufferFromStream(OutStream)),
FS(!isa<raw_svector_ostream>(OutStream) ? &OutStream : nullptr),
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

There are many classes derived from raw_ostream - is raw_svector_ostream the only one that needs to be treated specially?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, because it is really a buffer dressed as a stream. So the intention of the user would be to capture the bytes in that buffer.

We also special-case raw_fd_stream for the incremental flushing scenario.

FlushThreshold(uint64_t(FlushThreshold) << 20) {}

/// Convenience constructor for users that start with a vector - avoids
/// needing to wrap it in a raw_svector_ostream.
BitstreamWriter(SmallVectorImpl<char> &Buff)
: Buffer(Buff), FS(nullptr), FlushThreshold(0) {}

~BitstreamWriter() {
assert(CurBit == 0 && "Unflushed data remaining");
FlushToWord();
assert(BlockScope.empty() && CurAbbrevs.empty() && "Block imbalance");
FlushToFile(/*OnClosing=*/true);
}

/// For scenarios where the user wants to access a section of the stream to
/// (for example) compute some checksum, disable flushing and remember the
/// position in the internal buffer where that happened. Must be paired with a
/// call to getMarkedBufferAndResumeFlushing.
void markAndBlockFlushing() {
assert(!BlockFlushingStartPos);
BlockFlushingStartPos = Buffer.size();
}

/// resumes flushing, but does not flush, and returns the section in the
/// internal buffer starting from the position marked with
/// markAndBlockFlushing. The return should be processed before any additional
/// calls to this object, because those may cause a flush and invalidate the
/// return.
StringRef getMarkedBufferAndResumeFlushing() {
assert(BlockFlushingStartPos);
size_t Start = *BlockFlushingStartPos;
BlockFlushingStartPos.reset();
return {&Buffer[Start], Buffer.size() - Start};
}

/// Retrieve the current position in the stream, in bits.
Expand All @@ -141,16 +211,19 @@ class BitstreamWriter {
if (ByteNo >= NumOfFlushedBytes) {
assert((!endian::readAtBitAlignment<uint8_t, llvm::endianness::little,
unaligned>(
&Out[ByteNo - NumOfFlushedBytes], StartBit)) &&
&Buffer[ByteNo - NumOfFlushedBytes], StartBit)) &&
"Expected to be patching over 0-value placeholders");
endian::writeAtBitAlignment<uint8_t, llvm::endianness::little, unaligned>(
&Out[ByteNo - NumOfFlushedBytes], NewByte, StartBit);
&Buffer[ByteNo - NumOfFlushedBytes], NewByte, StartBit);
return;
}

// If we don't have a raw_fd_stream, GetNumOfFlushedBytes() should have
// returned 0, and we shouldn't be here.
assert(fdStream() != nullptr);
// If the byte offset to backpatch is flushed, use seek to backfill data.
// First, save the file position to restore later.
uint64_t CurPos = FS->tell();
uint64_t CurPos = fdStream()->tell();
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

For this change and at least one below, do we need to invoke fdStream() or can it simply continue to use FD which should have been initialized in the constructor?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we could use FD, but we should only be here if fdStream() != nullptr. The reason that happens is not very apparent, though: GetNumOfFlushedBytes returns 0 in that case, which makes the if-check above always succeed. Added an assert and a comment.


// Copy data to update into Bytes from the file FS and the buffer Out.
char Bytes[3]; // Use one more byte to silence a warning from Visual C++.
Expand All @@ -159,19 +232,19 @@ class BitstreamWriter {
size_t BytesFromBuffer = BytesNum - BytesFromDisk;

// When unaligned, copy existing data into Bytes from the file FS and the
// buffer Out so that it can be updated before writing. For debug builds
// buffer Buffer so that it can be updated before writing. For debug builds
// read bytes unconditionally in order to check that the existing value is 0
// as expected.
#ifdef NDEBUG
if (StartBit)
#endif
{
FS->seek(ByteNo);
ssize_t BytesRead = FS->read(Bytes, BytesFromDisk);
fdStream()->seek(ByteNo);
ssize_t BytesRead = fdStream()->read(Bytes, BytesFromDisk);
(void)BytesRead; // silence warning
assert(BytesRead >= 0 && static_cast<size_t>(BytesRead) == BytesFromDisk);
for (size_t i = 0; i < BytesFromBuffer; ++i)
Bytes[BytesFromDisk + i] = Out[i];
Bytes[BytesFromDisk + i] = Buffer[i];
assert((!endian::readAtBitAlignment<uint8_t, llvm::endianness::little,
unaligned>(Bytes, StartBit)) &&
"Expected to be patching over 0-value placeholders");
Expand All @@ -182,13 +255,13 @@ class BitstreamWriter {
Bytes, NewByte, StartBit);

// Copy updated data back to the file FS and the buffer Out.
FS->seek(ByteNo);
FS->write(Bytes, BytesFromDisk);
fdStream()->seek(ByteNo);
fdStream()->write(Bytes, BytesFromDisk);
for (size_t i = 0; i < BytesFromBuffer; ++i)
Out[i] = Bytes[BytesFromDisk + i];
Buffer[i] = Bytes[BytesFromDisk + i];

// Restore the file position.
FS->seek(CurPos);
fdStream()->seek(CurPos);
}

void BackpatchHalfWord(uint64_t BitNo, uint16_t Val) {
Expand Down Expand Up @@ -481,11 +554,11 @@ class BitstreamWriter {

// Emit literal bytes.
assert(llvm::all_of(Bytes, [](UIntTy B) { return isUInt<8>(B); }));
Out.append(Bytes.begin(), Bytes.end());
Buffer.append(Bytes.begin(), Bytes.end());

// Align end to 32-bits.
while (GetBufferOffset() & 3)
Out.push_back(0);
Buffer.push_back(0);
}
void emitBlob(StringRef Bytes, bool ShouldEmitSize = true) {
emitBlob(ArrayRef((const uint8_t *)Bytes.data(), Bytes.size()),
Expand Down
2 changes: 1 addition & 1 deletion llvm/include/llvm/ProfileData/PGOCtxProfWriter.h
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ class PGOCtxProfileWriter final {
public:
PGOCtxProfileWriter(raw_fd_stream &Out,
std::optional<unsigned> VersionOverride = std::nullopt)
: Writer(Buff, &Out, 0) {
: Writer(Out, 0) {
Writer.EnterSubblock(PGOCtxProfileBlockIDs::ProfileMetadataBlockID,
CodeLen);
const auto Version = VersionOverride ? *VersionOverride : CurrentVersion;
Expand Down
10 changes: 9 additions & 1 deletion llvm/include/llvm/Support/raw_ostream.h
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,7 @@ class raw_ostream {
enum class OStreamKind {
OK_OStream,
OK_FDStream,
OK_SVecStream,
};

private:
Expand Down Expand Up @@ -703,7 +704,11 @@ class raw_svector_ostream : public raw_pwrite_stream {
///
/// \param O The vector to write to; this should generally have at least 128
/// bytes free to avoid any extraneous memory overhead.
explicit raw_svector_ostream(SmallVectorImpl<char> &O) : OS(O) {
explicit raw_svector_ostream(SmallVectorImpl<char> &O)
: raw_pwrite_stream(false, raw_ostream::OStreamKind::OK_SVecStream),
OS(O) {
// FIXME: here and in a few other places, set directly to unbuffered in the
// ctor.
SetUnbuffered();
}

Expand All @@ -713,10 +718,13 @@ class raw_svector_ostream : public raw_pwrite_stream {

/// Return a StringRef for the vector contents.
StringRef str() const { return StringRef(OS.data(), OS.size()); }
SmallVectorImpl<char> &buffer() { return OS; }

void reserveExtraSpace(uint64_t ExtraSize) override {
OS.reserve(tell() + ExtraSize);
}

static bool classof(const raw_ostream *OS);
};

/// A raw_ostream that discards all output.
Expand Down
Loading
Loading