From 627a0e501208b5d22b50358e36cdf2eff1612dd7 Mon Sep 17 00:00:00 2001 From: Johannes Meyer zum Alten Borgloh Date: Sat, 8 Dec 2018 21:23:25 +0100 Subject: [PATCH] Improves Tumblr video detection - Adds tumblr specific video regexes to the tumblr search and tumblr tag search crawler - Fixes tumblr '/video_file/' regex --- .../Crawler/AbstractTumblrCrawler.cs | 13 ++++++++++ .../Crawler/TumblrBlogCrawler.cs | 24 +++---------------- .../Crawler/TumblrHiddenCrawler.cs | 24 +++---------------- .../Crawler/TumblrSearchCrawler.cs | 2 ++ .../Crawler/TumblrTagSearchCrawler.cs | 2 ++ .../Parser/TumblrParser.cs | 2 +- 6 files changed, 24 insertions(+), 43 deletions(-) diff --git a/src/TumblThree/TumblThree.Applications/Crawler/AbstractTumblrCrawler.cs b/src/TumblThree/TumblThree.Applications/Crawler/AbstractTumblrCrawler.cs index 317e9ce..4638a94 100644 --- a/src/TumblThree/TumblThree.Applications/Crawler/AbstractTumblrCrawler.cs +++ b/src/TumblThree/TumblThree.Applications/Crawler/AbstractTumblrCrawler.cs @@ -248,6 +248,19 @@ protected void AddTumblrVideoUrl(string post) } } + protected void AddInlineTumblrVideoUrl(string post, Regex regex) + { + foreach (Match match in regex.Matches(post)) + { + string videoUrl = match.Groups[1].Value; + + if (shellService.Settings.VideoSize == 480) + videoUrl += "_480"; + + AddToDownloadList(new VideoPost(videoUrl + ".mp4", Guid.NewGuid().ToString("N"))); + } + } + protected void AddGenericPhotoUrl(string post) { foreach (string imageUrl in tumblrParser.SearchForGenericPhotoUrl(post)) diff --git a/src/TumblThree/TumblThree.Applications/Crawler/TumblrBlogCrawler.cs b/src/TumblThree/TumblThree.Applications/Crawler/TumblrBlogCrawler.cs index e57f05e..0627379 100644 --- a/src/TumblThree/TumblThree.Applications/Crawler/TumblrBlogCrawler.cs +++ b/src/TumblThree/TumblThree.Applications/Crawler/TumblrBlogCrawler.cs @@ -461,9 +461,9 @@ private void AddVideoUrlToDownloadList(Post post) //var videoUrls = new HashSet(); - AddInlineVideoUrl(postCopy); - AddInlineTumblrVideoUrl(postCopy, new Regex("\"(https?://ve.media.tumblr.com/(tumblr_[\\w]*))")); - AddInlineTumblrVideoUrl(postCopy, new Regex("\"(https?://vtt.tumblr.com/(tumblr_[\\w]*))")); + AddTumblrVideoUrl(InlineSearch(postCopy)); + AddInlineTumblrVideoUrl(InlineSearch(postCopy), tumblrParser.GetTumblrVeVideoUrlRegex()); + AddInlineTumblrVideoUrl(InlineSearch(postCopy), tumblrParser.GetTumblrVttVideoUrlRegex()); if (blog.RegExVideos) AddGenericInlineVideoUrl(postCopy); @@ -490,24 +490,6 @@ private void AddVideoUrl(Post post) AddToJsonQueue(new TumblrCrawlerData(Path.ChangeExtension(videoUrl.Split('/').Last(), ".json"), post)); } - private void AddInlineVideoUrl(Post post) - { - AddTumblrVideoUrl(InlineSearch(post)); - } - - private void AddInlineTumblrVideoUrl(Post post, Regex regex) - { - foreach (Match match in regex.Matches(InlineSearch(post))) - { - string videoUrl = match.Groups[1].Value; - - if (shellService.Settings.VideoSize == 480) - videoUrl += "_480"; - - AddToDownloadList(new VideoPost(videoUrl + ".mp4", Guid.NewGuid().ToString("N"))); - } - } - private void AddGenericInlineVideoUrl(Post post) { AddGenericVideoUrl(InlineSearch(post)); diff --git a/src/TumblThree/TumblThree.Applications/Crawler/TumblrHiddenCrawler.cs b/src/TumblThree/TumblThree.Applications/Crawler/TumblrHiddenCrawler.cs index 8313000..d2bae33 100644 --- a/src/TumblThree/TumblThree.Applications/Crawler/TumblrHiddenCrawler.cs +++ b/src/TumblThree/TumblThree.Applications/Crawler/TumblrHiddenCrawler.cs @@ -461,9 +461,9 @@ private void AddVideoUrlToDownloadList(Post post) //var videoUrls = new HashSet(); - AddInlineVideoUrl(postCopy); - AddInlineTumblrVideoUrl(postCopy, new Regex("\"(https?://ve.media.tumblr.com/(tumblr_[\\w]*))")); - AddInlineTumblrVideoUrl(postCopy, new Regex("\"(https?://vtt.tumblr.com/(tumblr_[\\w]*))")); + AddTumblrVideoUrl(InlineSearch(postCopy)); + AddInlineTumblrVideoUrl(InlineSearch(postCopy), tumblrParser.GetTumblrVeVideoUrlRegex()); + AddInlineTumblrVideoUrl(InlineSearch(postCopy), tumblrParser.GetTumblrVttVideoUrlRegex()); if (blog.RegExVideos) AddGenericInlineVideoUrl(postCopy); @@ -490,24 +490,6 @@ private void AddVideoUrl(Post post) AddToJsonQueue(new TumblrCrawlerData(Path.ChangeExtension(videoUrl.Split('/').Last(), ".json"), post)); } - private void AddInlineVideoUrl(Post post) - { - AddTumblrVideoUrl(InlineSearch(post)); - } - - private void AddInlineTumblrVideoUrl(Post post, Regex regex) - { - foreach (Match match in regex.Matches(InlineSearch(post))) - { - string videoUrl = match.Groups[1].Value; - - if (shellService.Settings.VideoSize == 480) - videoUrl += "_480"; - - AddToDownloadList(new VideoPost(videoUrl + ".mp4", Guid.NewGuid().ToString("N"))); - } - } - private void AddGenericInlineVideoUrl(Post post) { AddGenericVideoUrl(InlineSearch(post)); diff --git a/src/TumblThree/TumblThree.Applications/Crawler/TumblrSearchCrawler.cs b/src/TumblThree/TumblThree.Applications/Crawler/TumblrSearchCrawler.cs index 60b3d60..915986f 100644 --- a/src/TumblThree/TumblThree.Applications/Crawler/TumblrSearchCrawler.cs +++ b/src/TumblThree/TumblThree.Applications/Crawler/TumblrSearchCrawler.cs @@ -201,6 +201,8 @@ private void AddVideoUrlToDownloadList(string document) if (!blog.DownloadVideo) return; AddTumblrVideoUrl(document); + AddInlineTumblrVideoUrl(document, tumblrParser.GetTumblrVeVideoUrlRegex()); + AddInlineTumblrVideoUrl(document, tumblrParser.GetTumblrVttVideoUrlRegex()); if (blog.RegExVideos) AddGenericVideoUrl(document); diff --git a/src/TumblThree/TumblThree.Applications/Crawler/TumblrTagSearchCrawler.cs b/src/TumblThree/TumblThree.Applications/Crawler/TumblrTagSearchCrawler.cs index 5932cad..e2f4481 100644 --- a/src/TumblThree/TumblThree.Applications/Crawler/TumblrTagSearchCrawler.cs +++ b/src/TumblThree/TumblThree.Applications/Crawler/TumblrTagSearchCrawler.cs @@ -238,6 +238,8 @@ private void AddVideoUrlToDownloadList(string document) if (!blog.DownloadVideo) return; AddTumblrVideoUrl(document); + AddInlineTumblrVideoUrl(document, tumblrParser.GetTumblrVeVideoUrlRegex()); + AddInlineTumblrVideoUrl(document, tumblrParser.GetTumblrVttVideoUrlRegex()); if (blog.RegExVideos) AddGenericVideoUrl(document); diff --git a/src/TumblThree/TumblThree.Applications/Parser/TumblrParser.cs b/src/TumblThree/TumblThree.Applications/Parser/TumblrParser.cs index e359b43..3fe2c30 100644 --- a/src/TumblThree/TumblThree.Applications/Parser/TumblrParser.cs +++ b/src/TumblThree/TumblThree.Applications/Parser/TumblrParser.cs @@ -13,7 +13,7 @@ public class TumblrParser : ITumblrParser public Regex GetTumblrVttVideoUrlRegex() => new Regex("\"(https?://vtt.tumblr.com/(tumblr_[\\w]*))"); - public Regex GetTumblrInlineVideoUrlRegex() => new Regex("\"(http[A-Za-z0-9_/:.]*.com/video_file/[A-Za-z0-9_/:.]*)\""); + public Regex GetTumblrInlineVideoUrlRegex() => new Regex("\"(http[A-Za-z0-9_/:.]*video_file[\\S]*/(tumblr_[\\w]*))[0-9/]*\""); public Regex GetGenericVideoUrlRegex() => new Regex("\"(https?://(?:[a-z0-9\\-]+\\.)+[a-z]{2,6}(?:/[^/#?]+)+\\.(?:mp4|mkv|gifv))\"");