Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions src/BloomExe/Publish/BloomPub/BloomPubMaker.cs
Original file line number Diff line number Diff line change
Expand Up @@ -170,6 +170,9 @@ public static string CreateBloomPub(
.Cast<SafeXmlElement>(),
modifiedBook.FolderPath
);
// BL-15848 Deduplication to save space. After publish-time media transforms so we compare the actual bloompub payload,
// not source files that may still be resized, trimmed, or rewritten later in the pipeline.
PublishHelper.DeDuplicateMediaFiles(modifiedBook.RawDom, modifiedBook.FolderPath);
Comment on lines +173 to +175
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

🚩 DeDuplicateMediaFiles only in BloomPub path, not ePub

DeDuplicateMediaFiles is called in CreateBloomPub (BloomPubMaker.cs:175), but the ePub path (EpubMaker.cs) only gets DeDuplicateGifs via MakeDeviceXmatterTempBook. This means ePub publishing doesn't benefit from full media deduplication (images, videos, non-talking audio). This may be intentional — ePub has different packaging requirements — but it's worth confirming whether ePub output could also benefit from this optimization.

Open in Devin Review

Was this helpful? React with 👍 or 👎 to provide feedback.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The card did not include epubs

var newContent = XmlHtmlConverter.ConvertDomToHtml5(modifiedBook.RawDom);

var originalBookHtmlPath = BookStorage.FindBookHtmlInFolder(modifiedBook.FolderPath);
Expand Down
309 changes: 309 additions & 0 deletions src/BloomExe/Publish/PublishHelper.cs
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
using System.Drawing;
using System.IO;
using System.Linq;
using System.Security.Cryptography;
using System.Text;
using System.Threading;
using System.Threading.Tasks;
Expand Down Expand Up @@ -1084,6 +1085,314 @@ public static Book.Book MakeDeviceXmatterTempBook(
return modifiedBook;
}

private sealed class MediaReference
{
public string RelativePath;
public Action<string> RewriteReference;
}

internal static void DeDuplicateMediaFiles(SafeXmlDocument dom, string folderPath)
{
DeDuplicateReferencedMedia(GetImageMediaReferences(dom), folderPath);
DeDuplicateReferencedMedia(GetVideoMediaReferences(dom), folderPath);
// Narration files are tied to specific spans, so keep those one-to-one file names stable.
var talkingBookAudioFileNames = GetTalkingBookAudioFileNames(dom);
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Copilot made this talkingBookAudioFileNames blacklist and it's generally redundant, since in GetNonTalkingAudioMediaReferences() we don't pick up TBT recordings anyway. But I guess it guards against the case where someone records TBT audio and then selects that same audio file (from within the book folder) as music within the same book multiple times. But that sounds to me like a diabolical corner case... should I remove this blacklist?

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't know that the blacklist hurts anything.

DeDuplicateReferencedMedia(
GetNonTalkingAudioMediaReferences(dom, talkingBookAudioFileNames),
folderPath
);
}

private static void DeDuplicateReferencedMedia(
IEnumerable<MediaReference> references,
string folderPath
)
{
var referencesByPath = new Dictionary<string, List<MediaReference>>();
var firstRelativePathByNormalizedPath = new Dictionary<string, string>();
var fullPathByNormalizedPath = new Dictionary<string, string>();
var normalizedPathsInEncounterOrder = new List<string>();

foreach (var reference in references)
{
if (string.IsNullOrWhiteSpace(reference.RelativePath))
continue;

var relativePath = NormalizeSlashes(reference.RelativePath);
var caseNormalizedRelativePath = BookStorage.GetNormalizedPathForOS(relativePath); // for comparison
if (!referencesByPath.TryGetValue(caseNormalizedRelativePath, out var refsForPath))
{
refsForPath = new List<MediaReference>();
referencesByPath[caseNormalizedRelativePath] = refsForPath;
firstRelativePathByNormalizedPath[caseNormalizedRelativePath] = relativePath;
fullPathByNormalizedPath[caseNormalizedRelativePath] = ResolveMediaFilePath(
folderPath,
relativePath
);
normalizedPathsInEncounterOrder.Add(caseNormalizedRelativePath);
}

refsForPath.Add(reference);
}

var hashToCanonicalRelativePath = new Dictionary<string, string>();
var normalizedPathsToDelete = new HashSet<string>();
foreach (var normalizedRelativePath in normalizedPathsInEncounterOrder)
{
var filePath = fullPathByNormalizedPath[normalizedRelativePath];
if (!RobustFile.Exists(filePath))
continue;

var hashString = ComputeFileHash(filePath);
if (
hashToCanonicalRelativePath.TryGetValue(
hashString,
out var canonicalRelativePath
)
)
{
// Rewrite all references before deleting any duplicate file so later references to
// the same duplicate path can still be resolved during this pass.
foreach (var reference in referencesByPath[normalizedRelativePath])
{
reference.RewriteReference(canonicalRelativePath);
}

normalizedPathsToDelete.Add(normalizedRelativePath);
}
else
{
hashToCanonicalRelativePath[hashString] = firstRelativePathByNormalizedPath[
normalizedRelativePath
];
}
}

foreach (var normalizedRelativePath in normalizedPathsToDelete)
{
RobustFile.Delete(fullPathByNormalizedPath[normalizedRelativePath]);
}
}

private static IEnumerable<MediaReference> GetImageMediaReferences(SafeXmlDocument dom)
{
foreach (
var imageElement in HtmlDom
.SelectChildImgAndBackgroundImageElements(dom.DocumentElement)
.Cast<SafeXmlElement>()
)
{
var relativePath = HtmlDom.GetImageElementUrl(imageElement).PathOnly.NotEncoded;
if (
string.IsNullOrWhiteSpace(relativePath)
|| ImageUtils.IsPlaceholderImageFilename(relativePath)
)
{
continue;
}

yield return new MediaReference
{
RelativePath = relativePath,
RewriteReference = canonicalRelativePath =>
HtmlDom.SetImageElementUrl(
imageElement,
UrlPathString.CreateFromUnencodedString(canonicalRelativePath)
),
};
}

foreach (
var bookSetting in dom.SafeSelectNodes(
"//div[@id='bloomDataDiv']/div[@data-book='coverImage' or @data-book='licenseImage']"
)
.Cast<SafeXmlElement>()
)
{
var relativePath = UrlPathString
.CreateFromUrlEncodedString(bookSetting.InnerText.Trim())
.PathOnly.NotEncoded;
if (
string.IsNullOrWhiteSpace(relativePath)
|| ImageUtils.IsPlaceholderImageFilename(relativePath)
)
{
continue;
}

yield return new MediaReference
{
RelativePath = relativePath,
RewriteReference = canonicalRelativePath =>
bookSetting.InnerText = canonicalRelativePath,
};
}
}

private static IEnumerable<MediaReference> GetVideoMediaReferences(SafeXmlDocument dom)
{
foreach (
var videoContainer in HtmlDom
.SelectChildVideoElements(dom.DocumentElement)
.Cast<SafeXmlElement>()
)
{
var relativePath = HtmlDom.GetVideoElementUrl(videoContainer).PathOnly.NotEncoded;
if (string.IsNullOrWhiteSpace(relativePath))
continue;

yield return new MediaReference
{
RelativePath = relativePath,
RewriteReference = canonicalRelativePath =>
HtmlDom.SetVideoElementUrl(
videoContainer,
UrlPathString.CreateFromUnencodedString(canonicalRelativePath)
),
};
}
}

private static IEnumerable<MediaReference> GetNonTalkingAudioMediaReferences(
SafeXmlDocument dom,
HashSet<string> talkingBookAudioFileNames
)
{
foreach (
var pageWithBackgroundMusic in HtmlDom
.SelectChildBackgroundMusicElements(dom.DocumentElement)
.Cast<SafeXmlElement>()
)
{
var reference = MakeAudioAttributeReference(
pageWithBackgroundMusic,
HtmlDom.musicAttrName,
talkingBookAudioFileNames
);
if (reference != null)
yield return reference;
}

foreach (
var soundElement in dom.SafeSelectNodes(".//*[@data-sound]").Cast<SafeXmlElement>()
)
{
var reference = MakeAudioAttributeReference(
soundElement,
"data-sound",
talkingBookAudioFileNames
);
if (reference != null)
yield return reference;
}

foreach (
var elementWithCorrectSound in dom.SafeSelectNodes(".//*[@data-correct-sound]")
Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think 6.4 reuses correct-sound and wrong-sound files but 6.3 doesn't?

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

6.3 also shared the correct and wrong sound files from the built-in list. Users can choose their own sound files, and those aren't shared. This behavior has not changed in 6.4.

.Cast<SafeXmlElement>()
)
{
var reference = MakeAudioAttributeReference(
elementWithCorrectSound,
"data-correct-sound",
talkingBookAudioFileNames
);
if (reference != null)
yield return reference;
}

foreach (
var elementWithWrongSound in dom.SafeSelectNodes(".//*[@data-wrong-sound]")
.Cast<SafeXmlElement>()
)
{
var reference = MakeAudioAttributeReference(
elementWithWrongSound,
"data-wrong-sound",
talkingBookAudioFileNames
);
if (reference != null)
yield return reference;
}
}

private static MediaReference MakeAudioAttributeReference(
SafeXmlElement element,
string attributeName,
HashSet<string> talkingBookAudioFileNames
)
{
var rawValue = element.GetAttribute(attributeName);
if (string.IsNullOrWhiteSpace(rawValue) || rawValue == "none")
return null;

var fileName = UrlPathString.CreateFromUrlEncodedString(rawValue).PathOnly.NotEncoded;
var normalizedFileName = BookStorage.GetNormalizedPathForOS(fileName);
if (talkingBookAudioFileNames.Contains(normalizedFileName))
return null;

return new MediaReference
{
RelativePath = MakeRelativePath("audio", fileName),
RewriteReference = canonicalRelativePath =>
element.SetAttribute(attributeName, Path.GetFileName(canonicalRelativePath)),
};
}

private static HashSet<string> GetTalkingBookAudioFileNames(SafeXmlDocument dom)
{
// Match Bloom's narration selector so we skip exactly the files publish already treats as
// talking-book audio, including split TextBox recordings.
var fileNames = new HashSet<string>();
foreach (
var narrationElement in HtmlDom
.SelectChildNarrationAudioElements(
dom.DocumentElement,
includeSplitTextBoxAudio: true,
langsToExclude: null
)
.Cast<SafeXmlElement>()
)
{
var narrationId = narrationElement.GetOptionalStringAttribute("id", null);
if (string.IsNullOrWhiteSpace(narrationId))
continue;

foreach (var fileName in BookStorage.GetNarrationAudioFileNames(narrationId, true))
{
fileNames.Add(BookStorage.GetNormalizedPathForOS(fileName));
}
}

return fileNames;
}

private static string ComputeFileHash(string filePath)
{
using var stream = RobustFile.OpenRead(filePath);
return Convert.ToHexString(SHA256.HashData(stream));
}

private static string ResolveMediaFilePath(string folderPath, string relativePath)
{
var path = relativePath.Replace('/', Path.DirectorySeparatorChar);
return BookStorage.GetNormalizedPathForOS(Path.Combine(folderPath, path));
}

private static string NormalizeSlashes(string relativePath)
{
return relativePath.Replace('\\', '/');
}

private static string MakeRelativePath(params string[] parts)
{
return string.Join(
"/",
parts
.Where(part => !string.IsNullOrWhiteSpace(part))
.Select(part => part.Trim('/', '\\'))
);
}

/// <summary>
/// Once we have really cropped any images that need it, we no longer need the "canvas element-like"
/// HTML structure that supports cropping background images. So we get rid of the extra structure
Expand Down
Loading