Skip to content
This repository has been archived by the owner on Mar 9, 2021. It is now read-only.

Commit

Permalink
Improves Tumblr video detection
Browse files Browse the repository at this point in the history
- Adds Tumblr-specific video regexes to the Tumblr search and Tumblr tag search crawlers
- Fixes the Tumblr '/video_file/' regex
  • Loading branch information
johanneszab committed Dec 8, 2018
1 parent cc34ea8 commit 627a0e5
Show file tree
Hide file tree
Showing 6 changed files with 24 additions and 43 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -248,6 +248,19 @@ protected void AddTumblrVideoUrl(string post)
}
}

/// <summary>
/// Queues one video download for every match of <paramref name="regex"/> in <paramref name="post"/>.
/// </summary>
/// <param name="post">Text to scan for video URLs.</param>
/// <param name="regex">
/// Pattern whose first capture group holds the video URL without the ".mp4" extension
/// (the extension is appended here).
/// </param>
protected void AddInlineTumblrVideoUrl(string post, Regex regex)
{
    foreach (Match match in regex.Matches(post))
    {
        string videoUrl = match.Groups[1].Value;

        // The Tumblr patterns capture the file name with \w*, which also swallows an
        // already present "_480" suffix. Only append the suffix when it is missing so
        // the resulting URL never ends in "_480_480.mp4".
        if (shellService.Settings.VideoSize == 480 && !videoUrl.EndsWith("_480", StringComparison.Ordinal))
            videoUrl += "_480";

        // A fresh GUID is passed as the second VideoPost argument (presumably the post id -- confirm against VideoPost).
        AddToDownloadList(new VideoPost(videoUrl + ".mp4", Guid.NewGuid().ToString("N")));
    }
}

protected void AddGenericPhotoUrl(string post)
{
foreach (string imageUrl in tumblrParser.SearchForGenericPhotoUrl(post))
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -461,9 +461,9 @@ private void AddVideoUrlToDownloadList(Post post)

//var videoUrls = new HashSet<string>();

AddInlineVideoUrl(postCopy);
AddInlineTumblrVideoUrl(postCopy, new Regex("\"(https?://ve.media.tumblr.com/(tumblr_[\\w]*))"));
AddInlineTumblrVideoUrl(postCopy, new Regex("\"(https?://vtt.tumblr.com/(tumblr_[\\w]*))"));
AddTumblrVideoUrl(InlineSearch(postCopy));
AddInlineTumblrVideoUrl(InlineSearch(postCopy), tumblrParser.GetTumblrVeVideoUrlRegex());
AddInlineTumblrVideoUrl(InlineSearch(postCopy), tumblrParser.GetTumblrVttVideoUrlRegex());
if (blog.RegExVideos)
AddGenericInlineVideoUrl(postCopy);

Expand All @@ -490,24 +490,6 @@ private void AddVideoUrl(Post post)
AddToJsonQueue(new TumblrCrawlerData<Post>(Path.ChangeExtension(videoUrl.Split('/').Last(), ".json"), post));
}

/// <summary>
/// Passes the result of <c>InlineSearch</c> for <paramref name="post"/> on to <c>AddTumblrVideoUrl</c>.
/// </summary>
/// <param name="post">Post handed to <c>InlineSearch</c>.</param>
private void AddInlineVideoUrl(Post post) => AddTumblrVideoUrl(InlineSearch(post));

/// <summary>
/// Runs <paramref name="regex"/> over the text returned by <c>InlineSearch</c> for
/// <paramref name="post"/> and queues one video download per match.
/// </summary>
/// <param name="post">Post handed to <c>InlineSearch</c>.</param>
/// <param name="regex">
/// Pattern whose first capture group holds the video URL without the ".mp4" extension
/// (the extension is appended here).
/// </param>
private void AddInlineTumblrVideoUrl(Post post, Regex regex)
{
    foreach (Match match in regex.Matches(InlineSearch(post)))
    {
        string videoUrl = match.Groups[1].Value;

        // A \w*-based capture can already end in "_480"; appending it a second time
        // would produce "..._480_480.mp4", so skip the suffix when it is present.
        if (shellService.Settings.VideoSize == 480 && !videoUrl.EndsWith("_480", StringComparison.Ordinal))
            videoUrl += "_480";

        // A fresh GUID is passed as the second VideoPost argument (presumably the post id -- confirm against VideoPost).
        AddToDownloadList(new VideoPost(videoUrl + ".mp4", Guid.NewGuid().ToString("N")));
    }
}

private void AddGenericInlineVideoUrl(Post post)
{
AddGenericVideoUrl(InlineSearch(post));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -461,9 +461,9 @@ private void AddVideoUrlToDownloadList(Post post)

//var videoUrls = new HashSet<string>();

AddInlineVideoUrl(postCopy);
AddInlineTumblrVideoUrl(postCopy, new Regex("\"(https?://ve.media.tumblr.com/(tumblr_[\\w]*))"));
AddInlineTumblrVideoUrl(postCopy, new Regex("\"(https?://vtt.tumblr.com/(tumblr_[\\w]*))"));
AddTumblrVideoUrl(InlineSearch(postCopy));
AddInlineTumblrVideoUrl(InlineSearch(postCopy), tumblrParser.GetTumblrVeVideoUrlRegex());
AddInlineTumblrVideoUrl(InlineSearch(postCopy), tumblrParser.GetTumblrVttVideoUrlRegex());
if (blog.RegExVideos)
AddGenericInlineVideoUrl(postCopy);

Expand All @@ -490,24 +490,6 @@ private void AddVideoUrl(Post post)
AddToJsonQueue(new TumblrCrawlerData<Post>(Path.ChangeExtension(videoUrl.Split('/').Last(), ".json"), post));
}

/// <summary>
/// Extracts the inline text of <paramref name="post"/> via <c>InlineSearch</c> and
/// forwards it to <c>AddTumblrVideoUrl</c>.
/// </summary>
/// <param name="post">Post handed to <c>InlineSearch</c>.</param>
private void AddInlineVideoUrl(Post post)
{
    string inlineText = InlineSearch(post);
    AddTumblrVideoUrl(inlineText);
}

/// <summary>
/// Queues a video download for each match of <paramref name="regex"/> in the text
/// that <c>InlineSearch</c> returns for <paramref name="post"/>.
/// </summary>
/// <param name="post">Post handed to <c>InlineSearch</c>.</param>
/// <param name="regex">
/// Pattern whose first capture group holds the video URL without the ".mp4" extension
/// (the extension is appended here).
/// </param>
private void AddInlineTumblrVideoUrl(Post post, Regex regex)
{
    foreach (Match match in regex.Matches(InlineSearch(post)))
    {
        string videoUrl = match.Groups[1].Value;

        // Avoid a doubled "_480_480" suffix: the capture group may already include
        // "_480" because \w matches underscores and digits.
        if (shellService.Settings.VideoSize == 480 && !videoUrl.EndsWith("_480", StringComparison.Ordinal))
            videoUrl += "_480";

        // A fresh GUID is passed as the second VideoPost argument (presumably the post id -- confirm against VideoPost).
        AddToDownloadList(new VideoPost(videoUrl + ".mp4", Guid.NewGuid().ToString("N")));
    }
}

private void AddGenericInlineVideoUrl(Post post)
{
AddGenericVideoUrl(InlineSearch(post));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -201,6 +201,8 @@ private void AddVideoUrlToDownloadList(string document)
if (!blog.DownloadVideo)
return;
AddTumblrVideoUrl(document);
AddInlineTumblrVideoUrl(document, tumblrParser.GetTumblrVeVideoUrlRegex());
AddInlineTumblrVideoUrl(document, tumblrParser.GetTumblrVttVideoUrlRegex());

if (blog.RegExVideos)
AddGenericVideoUrl(document);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -238,6 +238,8 @@ private void AddVideoUrlToDownloadList(string document)
if (!blog.DownloadVideo)
return;
AddTumblrVideoUrl(document);
AddInlineTumblrVideoUrl(document, tumblrParser.GetTumblrVeVideoUrlRegex());
AddInlineTumblrVideoUrl(document, tumblrParser.GetTumblrVttVideoUrlRegex());

if (blog.RegExVideos)
AddGenericVideoUrl(document);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ public class TumblrParser : ITumblrParser

/// <summary>
/// Builds the pattern for vtt.tumblr.com video URLs that directly follow a double quote.
/// Group 1 is the URL up to the end of the "tumblr_..." file name; group 2 is that file name.
/// </summary>
/// <returns>A new <see cref="Regex"/> instance on every call.</returns>
// The host dots are escaped so they match only a literal '.', not an arbitrary character.
public Regex GetTumblrVttVideoUrlRegex() => new Regex("\"(https?://vtt\\.tumblr\\.com/(tumblr_[\\w]*))");

/// <summary>
/// Builds the pattern for double-quoted URLs that contain ".com/video_file/".
/// Group 1 is the URL between the quotes.
/// </summary>
/// <returns>A new <see cref="Regex"/> instance on every call.</returns>
// The dot in front of "com" is escaped so it matches only a literal '.', not an arbitrary character.
public Regex GetTumblrInlineVideoUrlRegex() => new Regex("\"(http[A-Za-z0-9_/:.]*\\.com/video_file/[A-Za-z0-9_/:.]*)\"");
/// <summary>
/// Builds the pattern for double-quoted URLs that contain "video_file" followed later by a
/// "/tumblr_..." path segment. Group 1 is the URL up to the end of that segment, group 2 is
/// the "tumblr_..." segment itself; trailing digits and slashes before the closing quote
/// are matched but not captured.
/// </summary>
/// <returns>A new <see cref="Regex"/> instance on every call.</returns>
public Regex GetTumblrInlineVideoUrlRegex()
{
    const string pattern = "\"(http[A-Za-z0-9_/:.]*video_file[\\S]*/(tumblr_[\\w]*))[0-9/]*\"";
    return new Regex(pattern);
}

/// <summary>
/// Builds the pattern for double-quoted http(s) URLs whose path ends in
/// ".mp4", ".mkv" or ".gifv". Group 1 is the URL between the quotes.
/// </summary>
/// <returns>A new <see cref="Regex"/> instance on every call.</returns>
public Regex GetGenericVideoUrlRegex()
{
    const string pattern = "\"(https?://(?:[a-z0-9\\-]+\\.)+[a-z]{2,6}(?:/[^/#?]+)+\\.(?:mp4|mkv|gifv))\"";
    return new Regex(pattern);
}

Expand Down

0 comments on commit 627a0e5

Please sign in to comment.