-
-
Notifications
You must be signed in to change notification settings - Fork 18
BL 15848 DeDuplicateMediaFiles #7814
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: master
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -4,6 +4,7 @@ | |
| using System.Drawing; | ||
| using System.IO; | ||
| using System.Linq; | ||
| using System.Security.Cryptography; | ||
| using System.Text; | ||
| using System.Threading; | ||
| using System.Threading.Tasks; | ||
|
|
@@ -1084,6 +1085,314 @@ public static Book.Book MakeDeviceXmatterTempBook( | |
| return modifiedBook; | ||
| } | ||
|
|
||
| private sealed class MediaReference | ||
| { | ||
| public string RelativePath; | ||
| public Action<string> RewriteReference; | ||
| } | ||
|
|
||
| internal static void DeDuplicateMediaFiles(SafeXmlDocument dom, string folderPath) | ||
| { | ||
| DeDuplicateReferencedMedia(GetImageMediaReferences(dom), folderPath); | ||
| DeDuplicateReferencedMedia(GetVideoMediaReferences(dom), folderPath); | ||
| // Narration files are tied to specific spans, so keep those one-to-one file names stable. | ||
| var talkingBookAudioFileNames = GetTalkingBookAudioFileNames(dom); | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Copilot made this talkingBookAudioFileNames blacklist and it's generally redundant, since in GetNonTalkingAudioMediaReferences() we don't pick up TBT recordings anyway. But I guess it guards against the case where someone records TBT audio and then selects that same audio file (from within the book folder) as music within the same book multiple times. But that sounds to me like a diabolical corner case... should I remove this blacklist?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I don't know that the blacklist hurts anything. |
||
| DeDuplicateReferencedMedia( | ||
| GetNonTalkingAudioMediaReferences(dom, talkingBookAudioFileNames), | ||
| folderPath | ||
| ); | ||
| } | ||
|
|
||
| private static void DeDuplicateReferencedMedia( | ||
| IEnumerable<MediaReference> references, | ||
| string folderPath | ||
| ) | ||
| { | ||
| var referencesByPath = new Dictionary<string, List<MediaReference>>(); | ||
| var firstRelativePathByNormalizedPath = new Dictionary<string, string>(); | ||
| var fullPathByNormalizedPath = new Dictionary<string, string>(); | ||
| var normalizedPathsInEncounterOrder = new List<string>(); | ||
|
|
||
| foreach (var reference in references) | ||
| { | ||
| if (string.IsNullOrWhiteSpace(reference.RelativePath)) | ||
| continue; | ||
|
|
||
| var relativePath = NormalizeSlashes(reference.RelativePath); | ||
| var caseNormalizedRelativePath = BookStorage.GetNormalizedPathForOS(relativePath); // for comparison | ||
| if (!referencesByPath.TryGetValue(caseNormalizedRelativePath, out var refsForPath)) | ||
| { | ||
| refsForPath = new List<MediaReference>(); | ||
| referencesByPath[caseNormalizedRelativePath] = refsForPath; | ||
| firstRelativePathByNormalizedPath[caseNormalizedRelativePath] = relativePath; | ||
| fullPathByNormalizedPath[caseNormalizedRelativePath] = ResolveMediaFilePath( | ||
| folderPath, | ||
| relativePath | ||
| ); | ||
| normalizedPathsInEncounterOrder.Add(caseNormalizedRelativePath); | ||
| } | ||
|
|
||
| refsForPath.Add(reference); | ||
| } | ||
|
|
||
| var hashToCanonicalRelativePath = new Dictionary<string, string>(); | ||
| var normalizedPathsToDelete = new HashSet<string>(); | ||
| foreach (var normalizedRelativePath in normalizedPathsInEncounterOrder) | ||
| { | ||
| var filePath = fullPathByNormalizedPath[normalizedRelativePath]; | ||
| if (!RobustFile.Exists(filePath)) | ||
| continue; | ||
|
|
||
| var hashString = ComputeFileHash(filePath); | ||
| if ( | ||
| hashToCanonicalRelativePath.TryGetValue( | ||
| hashString, | ||
| out var canonicalRelativePath | ||
| ) | ||
| ) | ||
| { | ||
| // Rewrite all references before deleting any duplicate file so later references to | ||
| // the same duplicate path can still be resolved during this pass. | ||
| foreach (var reference in referencesByPath[normalizedRelativePath]) | ||
| { | ||
| reference.RewriteReference(canonicalRelativePath); | ||
| } | ||
|
|
||
| normalizedPathsToDelete.Add(normalizedRelativePath); | ||
| } | ||
| else | ||
| { | ||
| hashToCanonicalRelativePath[hashString] = firstRelativePathByNormalizedPath[ | ||
| normalizedRelativePath | ||
| ]; | ||
| } | ||
| } | ||
|
|
||
| foreach (var normalizedRelativePath in normalizedPathsToDelete) | ||
| { | ||
| RobustFile.Delete(fullPathByNormalizedPath[normalizedRelativePath]); | ||
| } | ||
| } | ||
|
|
||
| private static IEnumerable<MediaReference> GetImageMediaReferences(SafeXmlDocument dom) | ||
| { | ||
| foreach ( | ||
| var imageElement in HtmlDom | ||
| .SelectChildImgAndBackgroundImageElements(dom.DocumentElement) | ||
| .Cast<SafeXmlElement>() | ||
| ) | ||
| { | ||
| var relativePath = HtmlDom.GetImageElementUrl(imageElement).PathOnly.NotEncoded; | ||
| if ( | ||
| string.IsNullOrWhiteSpace(relativePath) | ||
| || ImageUtils.IsPlaceholderImageFilename(relativePath) | ||
| ) | ||
| { | ||
| continue; | ||
| } | ||
|
|
||
| yield return new MediaReference | ||
| { | ||
| RelativePath = relativePath, | ||
| RewriteReference = canonicalRelativePath => | ||
| HtmlDom.SetImageElementUrl( | ||
| imageElement, | ||
| UrlPathString.CreateFromUnencodedString(canonicalRelativePath) | ||
| ), | ||
| }; | ||
| } | ||
|
|
||
| foreach ( | ||
| var bookSetting in dom.SafeSelectNodes( | ||
| "//div[@id='bloomDataDiv']/div[@data-book='coverImage' or @data-book='licenseImage']" | ||
| ) | ||
| .Cast<SafeXmlElement>() | ||
| ) | ||
| { | ||
| var relativePath = UrlPathString | ||
| .CreateFromUrlEncodedString(bookSetting.InnerText.Trim()) | ||
| .PathOnly.NotEncoded; | ||
| if ( | ||
| string.IsNullOrWhiteSpace(relativePath) | ||
| || ImageUtils.IsPlaceholderImageFilename(relativePath) | ||
| ) | ||
| { | ||
| continue; | ||
| } | ||
|
|
||
| yield return new MediaReference | ||
| { | ||
| RelativePath = relativePath, | ||
| RewriteReference = canonicalRelativePath => | ||
| bookSetting.InnerText = canonicalRelativePath, | ||
| }; | ||
| } | ||
| } | ||
|
|
||
| private static IEnumerable<MediaReference> GetVideoMediaReferences(SafeXmlDocument dom) | ||
| { | ||
| foreach ( | ||
| var videoContainer in HtmlDom | ||
| .SelectChildVideoElements(dom.DocumentElement) | ||
| .Cast<SafeXmlElement>() | ||
| ) | ||
| { | ||
| var relativePath = HtmlDom.GetVideoElementUrl(videoContainer).PathOnly.NotEncoded; | ||
| if (string.IsNullOrWhiteSpace(relativePath)) | ||
| continue; | ||
|
|
||
| yield return new MediaReference | ||
| { | ||
| RelativePath = relativePath, | ||
| RewriteReference = canonicalRelativePath => | ||
| HtmlDom.SetVideoElementUrl( | ||
| videoContainer, | ||
| UrlPathString.CreateFromUnencodedString(canonicalRelativePath) | ||
| ), | ||
| }; | ||
| } | ||
| } | ||
|
|
||
| private static IEnumerable<MediaReference> GetNonTalkingAudioMediaReferences( | ||
| SafeXmlDocument dom, | ||
| HashSet<string> talkingBookAudioFileNames | ||
| ) | ||
| { | ||
| foreach ( | ||
| var pageWithBackgroundMusic in HtmlDom | ||
| .SelectChildBackgroundMusicElements(dom.DocumentElement) | ||
| .Cast<SafeXmlElement>() | ||
| ) | ||
| { | ||
| var reference = MakeAudioAttributeReference( | ||
| pageWithBackgroundMusic, | ||
| HtmlDom.musicAttrName, | ||
| talkingBookAudioFileNames | ||
| ); | ||
| if (reference != null) | ||
| yield return reference; | ||
| } | ||
|
|
||
| foreach ( | ||
| var soundElement in dom.SafeSelectNodes(".//*[@data-sound]").Cast<SafeXmlElement>() | ||
| ) | ||
| { | ||
| var reference = MakeAudioAttributeReference( | ||
| soundElement, | ||
| "data-sound", | ||
| talkingBookAudioFileNames | ||
| ); | ||
| if (reference != null) | ||
| yield return reference; | ||
| } | ||
|
|
||
| foreach ( | ||
| var elementWithCorrectSound in dom.SafeSelectNodes(".//*[@data-correct-sound]") | ||
|
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think 6.4 reuses correct-sound and wrong-sound files but 6.3 doesn't?
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. 6.3 also shared the correct and wrong sound files from the built-in list. Users can choose their own sound files, and those aren't shared. This behavior has not changed in 6.4. |
||
| .Cast<SafeXmlElement>() | ||
| ) | ||
| { | ||
| var reference = MakeAudioAttributeReference( | ||
| elementWithCorrectSound, | ||
| "data-correct-sound", | ||
| talkingBookAudioFileNames | ||
| ); | ||
| if (reference != null) | ||
| yield return reference; | ||
| } | ||
|
|
||
| foreach ( | ||
| var elementWithWrongSound in dom.SafeSelectNodes(".//*[@data-wrong-sound]") | ||
| .Cast<SafeXmlElement>() | ||
| ) | ||
| { | ||
| var reference = MakeAudioAttributeReference( | ||
| elementWithWrongSound, | ||
| "data-wrong-sound", | ||
| talkingBookAudioFileNames | ||
| ); | ||
| if (reference != null) | ||
| yield return reference; | ||
| } | ||
| } | ||
|
|
||
| private static MediaReference MakeAudioAttributeReference( | ||
| SafeXmlElement element, | ||
| string attributeName, | ||
| HashSet<string> talkingBookAudioFileNames | ||
| ) | ||
| { | ||
| var rawValue = element.GetAttribute(attributeName); | ||
| if (string.IsNullOrWhiteSpace(rawValue) || rawValue == "none") | ||
| return null; | ||
|
|
||
| var fileName = UrlPathString.CreateFromUrlEncodedString(rawValue).PathOnly.NotEncoded; | ||
| var normalizedFileName = BookStorage.GetNormalizedPathForOS(fileName); | ||
| if (talkingBookAudioFileNames.Contains(normalizedFileName)) | ||
| return null; | ||
|
|
||
| return new MediaReference | ||
| { | ||
| RelativePath = MakeRelativePath("audio", fileName), | ||
| RewriteReference = canonicalRelativePath => | ||
| element.SetAttribute(attributeName, Path.GetFileName(canonicalRelativePath)), | ||
| }; | ||
| } | ||
|
|
||
| private static HashSet<string> GetTalkingBookAudioFileNames(SafeXmlDocument dom) | ||
| { | ||
| // Match Bloom's narration selector so we skip exactly the files publish already treats as | ||
| // talking-book audio, including split TextBox recordings. | ||
| var fileNames = new HashSet<string>(); | ||
| foreach ( | ||
| var narrationElement in HtmlDom | ||
| .SelectChildNarrationAudioElements( | ||
| dom.DocumentElement, | ||
| includeSplitTextBoxAudio: true, | ||
| langsToExclude: null | ||
| ) | ||
| .Cast<SafeXmlElement>() | ||
| ) | ||
| { | ||
| var narrationId = narrationElement.GetOptionalStringAttribute("id", null); | ||
| if (string.IsNullOrWhiteSpace(narrationId)) | ||
| continue; | ||
|
|
||
| foreach (var fileName in BookStorage.GetNarrationAudioFileNames(narrationId, true)) | ||
| { | ||
| fileNames.Add(BookStorage.GetNormalizedPathForOS(fileName)); | ||
| } | ||
| } | ||
|
|
||
| return fileNames; | ||
| } | ||
|
|
||
| private static string ComputeFileHash(string filePath) | ||
| { | ||
| using var stream = RobustFile.OpenRead(filePath); | ||
| return Convert.ToHexString(SHA256.HashData(stream)); | ||
| } | ||
|
|
||
| private static string ResolveMediaFilePath(string folderPath, string relativePath) | ||
| { | ||
| var path = relativePath.Replace('/', Path.DirectorySeparatorChar); | ||
| return BookStorage.GetNormalizedPathForOS(Path.Combine(folderPath, path)); | ||
| } | ||
|
|
||
| private static string NormalizeSlashes(string relativePath) | ||
| { | ||
| return relativePath.Replace('\\', '/'); | ||
| } | ||
|
|
||
| private static string MakeRelativePath(params string[] parts) | ||
| { | ||
| return string.Join( | ||
| "/", | ||
| parts | ||
| .Where(part => !string.IsNullOrWhiteSpace(part)) | ||
| .Select(part => part.Trim('/', '\\')) | ||
| ); | ||
| } | ||
|
|
||
| /// <summary> | ||
| /// Once we have really cropped any images that need it, we no longer need the "canvas element-like" | ||
| /// HTML structure that supports cropping background images. So we get rid of the extra structure | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
🚩 DeDuplicateMediaFiles only in BloomPub path, not ePub
DeDuplicateMediaFilesis called inCreateBloomPub(BloomPubMaker.cs:175), but the ePub path (EpubMaker.cs) only getsDeDuplicateGifsviaMakeDeviceXmatterTempBook. This means ePub publishing doesn't benefit from full media deduplication (images, videos, non-talking audio). This may be intentional — ePub has different packaging requirements — but it's worth confirming whether ePub output could also benefit from this optimization.Was this helpful? React with 👍 or 👎 to provide feedback.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The card did not include epubs