Use std::u8string where appropriate

fmang · Mar 3, 2023 · 1d13c25 · 1d13c25
1 parent 89dc000
commit 1d13c25
Show file tree

Hide file tree

Showing 7 changed files with 123 additions and 122 deletions.
diff --git a/src/base64.cc b/src/base64.cc
@@ -13,23 +13,23 @@
 
 #include <cstring>
 
-static const char base64_table[65] =
-	"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
+static const char8_t base64_table[65] =
+	u8"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
 
-std::string ot::encode_base64(ot::byte_string_view src)
+std::u8string ot::encode_base64(ot::byte_string_view src)
 {
 	size_t len = src.size();
 	size_t num_blocks = (len + 2) / 3; // Count of 3-byte blocks, rounded up.
 	size_t olen = num_blocks * 4; // Each 3-byte block becomes 4 base64 bytes.
 	if (olen < len)
 		throw std::overflow_error("failed to encode excessively long base64 block");
 
-	std::string out;
+	std::u8string out;
 	out.resize(olen);
 
 	const uint8_t* in = src.data();
 	const uint8_t* end = in + len;
-	char* pos = out.data();
+	char8_t* pos = out.data();
 	while (end - in >= 3) {
 		*pos++ = base64_table[in[0] >> 2];
 		*pos++ = base64_table[((in[0] & 0x03) << 4) | (in[1] >> 4)];
@@ -53,10 +53,10 @@ std::string ot::encode_base64(ot::byte_string_view src)
 	return out;
 }
 
-ot::byte_string ot::decode_base64(std::string_view src)
+ot::byte_string ot::decode_base64(std::u8string_view src)
 {
 	// Remove the padding and rely on the string length instead.
-	while (src.back() == '=')
+	while (src.back() == u8'=')
 		src.remove_suffix(1);
 
 	size_t olen = src.size() / 4 * 3; // Whole blocks;

diff --git a/src/cli.cc b/src/cli.cc
@@ -65,6 +65,8 @@ ot::options ot::parse_options(int argc, char** argv, FILE* comments_input)
 	options opt;
 	const char* equal;
 	ot::status rc;
+	std::list<std::string> local_to_add; // opt.to_add before UTF-8 conversion.
+	std::list<std::string> local_to_delete; // opt.to_delete before UTF-8 conversion.
 	bool set_all = false;
 	std::optional<std::string> set_cover;
 	opt = {};
@@ -90,16 +92,16 @@ ot::options ot::parse_options(int argc, char** argv, FILE* comments_input)
 			opt.overwrite = true;
 			break;
 		case 'd':
-			opt.to_delete.emplace_back(optarg);
+			local_to_delete.emplace_back(optarg);
 			break;
 		case 'a':
 		case 's':
 			equal = strchr(optarg, '=');
 			if (equal == nullptr)
 				throw status {st::bad_arguments, "Comment does not contain an equal sign: "s + optarg + "."};
 			if (c == 's')
-				opt.to_delete.emplace_back(optarg, equal - optarg);
-			opt.to_add.emplace_back(optarg);
+				local_to_delete.emplace_back(optarg, equal - optarg);
+			local_to_add.emplace_back(optarg);
 			break;
 		case 'S':
 			opt.delete_all = true;
@@ -151,14 +153,22 @@ ot::options ot::parse_options(int argc, char** argv, FILE* comments_input)
 		throw status { st::bad_arguments, "Cannot use standard input more than once." };
 
 	// Convert arguments to UTF-8.
-	if (!opt.raw) {
-		for (std::list<std::string>* args : { &opt.to_add, &opt.to_delete }) {
-			try {
-				for (std::string& arg : *args)
-					arg = to_utf8(arg);
-			} catch (const ot::status& rc) {
-				throw status {st::bad_arguments, "Could not encode argument into UTF-8: " + rc.message};
-			}
+	if (opt.raw) {
+		// Cast the user data without any encoding conversion.
+		auto cast_to_utf8 = [](std::string_view in)
+			{ return std::u8string(reinterpret_cast<const char8_t*>(in.data()), in.size()); };
+		std::transform(local_to_add.begin(), local_to_add.end(),
+			       std::back_inserter(opt.to_add), cast_to_utf8);
+		std::transform(local_to_delete.begin(), local_to_delete.end(),
+			       std::back_inserter(opt.to_delete), cast_to_utf8);
+	} else {
+		try {
+			std::transform(local_to_add.begin(), local_to_add.end(),
+			               std::back_inserter(opt.to_add), encode_utf8);
+			std::transform(local_to_delete.begin(), local_to_delete.end(),
+			               std::back_inserter(opt.to_delete), encode_utf8);
+		} catch (const ot::status& rc) {
+			throw status {st::bad_arguments, "Could not encode argument into UTF-8: " + rc.message};
 		}
 	}
 
@@ -188,35 +198,35 @@ ot::options ot::parse_options(int argc, char** argv, FILE* comments_input)
 
 	if (set_cover) {
 		byte_string picture_data = ot::slurp_binary_file(set_cover->c_str());
-		opt.to_delete.push_back("METADATA_BLOCK_PICTURE");
+		opt.to_delete.push_back(u8"METADATA_BLOCK_PICTURE"s);
 		opt.to_add.push_back(ot::make_cover(picture_data));
 	}
 
 	if (set_all) {
 		// Read comments from stdin and prepend them to opt.to_add.
-		std::list<std::string> comments = read_comments(comments_input, opt.raw);
+		std::list<std::u8string> comments = read_comments(comments_input, opt.raw);
 		opt.to_add.splice(opt.to_add.begin(), std::move(comments));
 	}
 	return opt;
 }
 
 /** Format a UTF-8 string by adding tabulations (\t) after line feeds (\n) to mark continuation for
  *  multiline values. */
-static std::string format_value(const std::string& source)
+static std::u8string format_value(const std::u8string& source)
 {
-	auto newline_count = std::count(source.begin(), source.end(), '\n');
+	auto newline_count = std::count(source.begin(), source.end(), u8'\n');
 
 	// General case: the value fits on a single line. Use std::string’s copy constructor for the
 	// most efficient copy we could hope for.
 	if (newline_count == 0)
 		return source;
 
-	std::string formatted;
+	std::u8string formatted;
 	formatted.reserve(source.size() + newline_count);
 	for (auto c : source) {
 		formatted.push_back(c);
 		if (c == '\n')
-			formatted.push_back('\t');
+			formatted.push_back(u8'\t');
 	}
 	return formatted;
 }
@@ -227,11 +237,10 @@ static std::string format_value(const std::string& source)
  * To disambiguate between a newline embedded in a comment and a newline representing the start of
  * the next tag, continuation lines always have a single TAB (^I) character added to the beginning.
  */
-void ot::print_comments(const std::list<std::string>& comments, FILE* output, bool raw)
+void ot::print_comments(const std::list<std::u8string>& comments, FILE* output, bool raw)
 {
-	std::string local;
 	bool has_control = false;
-	for (const std::string& source_comment : comments) {
+	for (const std::u8string& source_comment : comments) {
 		if (!has_control) { // Don’t bother analyzing comments if the flag is already up.
 			for (unsigned char c : source_comment) {
 				if (c < 0x20 && c != '\n') {
@@ -241,46 +250,43 @@ void ot::print_comments(const std::list<std::string>& comments, FILE* output, bo
 			}
 		}
 
-		std::string utf8_comment = format_value(source_comment);
-		const std::string* comment;
+		std::u8string utf8_comment = format_value(source_comment);
 		// Convert the comment from UTF-8 to the system encoding if relevant.
 		if (raw) {
-			comment = &utf8_comment;
+			fwrite(utf8_comment.data(), 1, utf8_comment.size(), output);
 		} else {
 			try {
-				local = from_utf8(utf8_comment);
-				comment = &local;
+				std::string local = decode_utf8(utf8_comment);
+				fwrite(local.data(), 1, local.size(), output);
 			} catch (ot::status& rc) {
 				rc.message += " See --raw.";
 				throw;
 			}
 		}
-
-		fwrite(comment->data(), 1, comment->size(), output);
 		putc('\n', output);
 	}
 	if (has_control)
 		fputs("warning: Some tags contain control characters.\n", stderr);
 }
 
-std::list<std::string> ot::read_comments(FILE* input, bool raw)
+std::list<std::u8string> ot::read_comments(FILE* input, bool raw)
 {
-	std::list<std::string> comments;
+	std::list<std::u8string> comments;
 	comments.clear();
 	char* source_line = nullptr;
 	size_t buflen = 0;
 	ssize_t nread;
-	std::string* previous_comment = nullptr;
+	std::u8string* previous_comment = nullptr;
 	while ((nread = getline(&source_line, &buflen, input)) != -1) {
 		if (nread > 0 && source_line[nread - 1] == '\n')
 			--nread; // Chomp.
 
-		std::string line;
+		std::u8string line;
 		if (raw) {
-			line = std::string(source_line, nread);
+			line = std::u8string(reinterpret_cast<char8_t*>(source_line), nread);
 		} else {
 			try {
-				line = to_utf8(std::string_view(source_line, nread));
+				line = encode_utf8(std::string_view(source_line, nread));
 			} catch (const ot::status& rc) {
 				free(source_line);
 				throw ot::status {ot::st::badly_encoded, "UTF-8 conversion error: " + rc.message};
@@ -290,10 +296,10 @@ std::list<std::string> ot::read_comments(FILE* input, bool raw)
 		if (line.empty()) {
 			// Ignore empty lines.
 			previous_comment = nullptr;
-		} else if (line[0] == '#') {
+		} else if (line[0] == u8'#') {
 			// Ignore comments.
 			previous_comment = nullptr;
-		} else if (line[0] == '\t') {
+		} else if (line[0] == u8'\t') {
 			// Continuation line: append the current line to the previous tag.
 			if (previous_comment == nullptr) {
 				ot::status rc = {ot::st::error, "Unexpected continuation line: " + std::string(source_line, nread)};
@@ -303,7 +309,7 @@ std::list<std::string> ot::read_comments(FILE* input, bool raw)
 				line[0] = '\n';
 				previous_comment->append(line);
 			}
-		} else if (line.find('=') == std::string::npos) {
+		} else if (line.find(u8'=') == decltype(line)::npos) {
 			ot::status rc = {ot::st::error, "Malformed tag: " + std::string(source_line, nread)};
 			free(source_line);
 			throw rc;
@@ -315,19 +321,20 @@ std::list<std::string> ot::read_comments(FILE* input, bool raw)
 	return comments;
 }
 
-void ot::delete_comments(std::list<std::string>& comments, const std::string& selector)
+void ot::delete_comments(std::list<std::u8string>& comments, const std::u8string& selector)
 {
 	auto name = selector.data();
-	auto equal = selector.find('=');
-	auto value = (equal == std::string::npos ? nullptr : name + equal + 1);
+	auto equal = selector.find(u8'=');
+	auto value = (equal == std::u8string::npos ? nullptr : name + equal + 1);
 	auto name_len = value ? equal : selector.size();
 	auto value_len = value ? selector.size() - equal - 1 : 0;
 	auto it = comments.begin(), end = comments.end();
 	while (it != end) {
 		auto current = it++;
+		/** \todo Avoid using strncasecmp because it assumes the system locale is UTF-8. */
 		bool name_match = current->size() > name_len + 1 &&
 		                  (*current)[name_len] == '=' &&
-		                  strncasecmp(current->data(), name, name_len) == 0;
+		                  strncasecmp((const char*) current->data(), (const char*) name, name_len) == 0;
 		if (!name_match)
 			continue;
 		bool value_match = value == nullptr ||
@@ -343,11 +350,11 @@ static void edit_tags(ot::opus_tags& tags, const ot::options& opt)
 {
 	if (opt.delete_all) {
 		tags.comments.clear();
-	} else for (const std::string& name : opt.to_delete) {
-		ot::delete_comments(tags.comments, name.c_str());
+	} else for (const std::u8string& name : opt.to_delete) {
+		ot::delete_comments(tags.comments, name);
 	}
 
-	for (const std::string& comment : opt.to_add)
+	for (const std::u8string& comment : opt.to_add)
 		tags.comments.emplace_back(comment);
 }
 

diff --git a/src/opus.cc b/src/opus.cc
@@ -30,14 +30,14 @@ ot::opus_tags ot::parse_tags(const ogg_packet& packet)
 	if (packet.bytes < 0)
 		throw status {st::int_overflow, "Overflowing comment header length"};
 	size_t size = static_cast<size_t>(packet.bytes);
-	const char* data = reinterpret_cast<char*>(packet.packet);
+	const uint8_t* data = reinterpret_cast<uint8_t*>(packet.packet);
 	size_t pos = 0;
 	opus_tags my_tags;
 
 	// Magic number
 	if (8 > size)
 		throw status {st::cut_magic_number, "Comment header too short for the magic number"};
-	if (memcmp(data, "OpusTags", 8) != 0)
+	if (memcmp(data, u8"OpusTags", 8) != 0)
 		throw status {st::bad_magic_number, "Comment header did not start with OpusTags"};
 
 	// Vendor
@@ -48,7 +48,7 @@ ot::opus_tags ot::parse_tags(const ogg_packet& packet)
 	size_t vendor_length = le32toh(*((uint32_t*) (data + pos)));
 	if (pos + 4 + vendor_length > size)
 		throw status {st::cut_vendor_data, "Vendor string did not fit the comment header"};
-	my_tags.vendor = std::string(data + pos + 4, vendor_length);
+	my_tags.vendor = std::u8string(reinterpret_cast<const char8_t*>(&data[pos + 4]), vendor_length);
 	pos += 4 + my_tags.vendor.size();
 
 	// Comment count
@@ -66,21 +66,21 @@ ot::opus_tags ot::parse_tags(const ogg_packet& packet)
 		if (pos + 4 + comment_length > size)
 			throw status {st::cut_comment_data,
 			              "Comment string did not fit the comment header"};
-		const char *comment_value = data + pos + 4;
+		auto comment_value = reinterpret_cast<const char8_t*>(&data[pos + 4]);
 		my_tags.comments.emplace_back(comment_value, comment_length);
 		pos += 4 + comment_length;
 	}
 
 	// Extra data
-	my_tags.extra_data = std::string(data + pos, size - pos);
+	my_tags.extra_data = byte_string(data + pos, size - pos);
 
 	return my_tags;
 }
 
 ot::dynamic_ogg_packet ot::render_tags(const opus_tags& tags)
 {
 	size_t size = 8 + 4 + tags.vendor.size() + 4;
-	for (const std::string& comment : tags.comments)
+	for (const std::u8string& comment : tags.comments)
 		size += 4 + comment.size();
 	size += tags.extra_data.size();
 
@@ -100,7 +100,7 @@ ot::dynamic_ogg_packet ot::render_tags(const opus_tags& tags)
 	n = htole32(tags.comments.size());
 	memcpy(data, &n, 4);
 	data += 4;
-	for (const std::string& comment : tags.comments) {
+	for (const std::u8string& comment : tags.comments) {
 		n = htole32(comment.size());
 		memcpy(data, &n, 4);
 		memcpy(data+4, comment.data(), comment.size());
@@ -166,8 +166,8 @@ ot::byte_string ot::picture::serialize() const
  */
 std::optional<ot::picture> ot::extract_cover(const ot::opus_tags& tags)
 {
-	static const std::string_view prefix = "METADATA_BLOCK_PICTURE="sv;
-	auto is_cover = [](const std::string& tag) { return tag.starts_with(prefix); };
+	static const std::u8string_view prefix = u8"METADATA_BLOCK_PICTURE="sv;
+	auto is_cover = [](const std::u8string& tag) { return tag.starts_with(prefix); };
 	auto cover_tag = std::find_if(tags.comments.begin(), tags.comments.end(), is_cover);
 	if (cover_tag == tags.comments.end())
 		return {}; // No cover art.
@@ -177,7 +177,7 @@ std::optional<ot::picture> ot::extract_cover(const ot::opus_tags& tags)
 		fputs("warning: Found multiple covers; only the first will be extracted."
 		              " Please report your use case if you need a finer selection.\n", stderr);
 
-	std::string_view cover_value = *cover_tag;
+	std::u8string_view cover_value = *cover_tag;
 	cover_value.remove_prefix(prefix.size());
 	return picture(decode_base64(cover_value));
 }
@@ -202,10 +202,10 @@ static ot::byte_string_view detect_mime_type(ot::byte_string_view data)
 	return "application/octet-stream"_bsv;
 }
 
-std::string ot::make_cover(ot::byte_string_view picture_data)
+std::u8string ot::make_cover(ot::byte_string_view picture_data)
 {
 	picture pic;
 	pic.mime_type = detect_mime_type(picture_data);
 	pic.picture_data = picture_data;
-	return "METADATA_BLOCK_PICTURE=" + encode_base64(pic.serialize());
+	return u8"METADATA_BLOCK_PICTURE=" + encode_base64(pic.serialize());
 }