Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ISSUE-28] Cache initial HTML page retrieval #31

Merged
merged 3 commits into from
Oct 16, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions source/app.d
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,10 @@ import std.range : iota;
import std.logger;
import std.getopt;

import parsers;
import downloaders;
import helpers;
import parsers : YoutubeFormat, YoutubeVideoURLExtractor;
import cache : Cache;

void main(string[] args)
{
Expand Down Expand Up @@ -74,10 +75,10 @@ void main(string[] args)

void handleURL(string url, int itag, StdoutLogger logger, bool displayFormats, bool outputURL, bool parallel, bool noProgress)
{
auto cache = Cache(logger);
logger.display(formatTitle("Handling " ~ url));
string html = url.get().idup;
logger.displayVerbose("Downloaded video HTML");
YoutubeVideoURLExtractor parser = makeParser(html, logger);
YoutubeVideoURLExtractor parser = cache.makeParser(url, itag);

if(displayFormats)
{
Expand Down
167 changes: 167 additions & 0 deletions source/cache.d
Original file line number Diff line number Diff line change
@@ -0,0 +1,167 @@
import std.stdio : writeln;
import std.array : replace;
import std.base64 : Base64URL;
import std.conv : to;
import std.datetime : SysTime, Clock, days;
import std.file : exists, getcwd, readText, tempDir, write;
import std.net.curl : get;
import std.path : buildPath;
import std.string : indexOf;

import helpers : StdoutLogger, parseID, parseQueryString;
import parsers : parseBaseJSURL, YoutubeVideoURLExtractor, SimpleYoutubeVideoURLExtractor, AdvancedYoutubeVideoURLExtractor;

/// Disk-backed cache for a YouTube watch page's HTML and its player base.js,
/// so repeated runs for the same video skip the network round trip.
///
/// Entries are keyed by video ID (falling back to a Base64URL encoding of the
/// whole URL) and are considered stale once the stream URL's "expire"
/// timestamp has passed.
struct Cache
{
    private StdoutLogger logger;
    // Injectable download routine so unit tests can stub out the network.
    private string delegate(string url) downloadAsString;
    string cacheDirectory;

    this(StdoutLogger logger)
    {
        this.logger = logger;
        downloadAsString = (string url) => url.get().idup;
        cacheDirectory = tempDir();
    }

    this(StdoutLogger logger, string delegate(string url) downloadAsString)
    {
        this(logger);
        this.downloadAsString = downloadAsString;
    }

    /// Returns a parser for `url`, refreshing the cache first if it is
    /// missing or stale. The base.js payload is only read when the page
    /// uses signatureCipher (i.e. needs the advanced extractor).
    YoutubeVideoURLExtractor makeParser(string url, int itag)
    {
        string html = getHTML(url, itag);
        if(html.indexOf("signatureCipher") == -1)
        {
            return new SimpleYoutubeVideoURLExtractor(html, logger);
        }
        string baseJS = getBaseJS(url, itag);
        return new AdvancedYoutubeVideoURLExtractor(html, baseJS, logger);
    }

    /// Returns the cached (or freshly downloaded) watch-page HTML.
    private string getHTML(string url, int itag)
    {
        updateCache(url, itag);
        return htmlCachePath(url).readText();
    }

    /// Returns the cached (or freshly downloaded) player base.js.
    private string getBaseJS(string url, int itag)
    {
        updateCache(url, itag);
        return baseJSCachePath(url).readText();
    }

    // Both cache files share one key per URL; only the extension differs.
    private string htmlCachePath(string url)
    {
        return getCachePath(url) ~ ".html";
    }

    private string baseJSCachePath(string url)
    {
        return getCachePath(url) ~ ".js";
    }

    /// Re-downloads both the HTML and its base.js when the cached HTML is
    /// absent or its stream URLs have expired; otherwise leaves them as is.
    private void updateCache(string url, int itag)
    {
        string htmlPath = htmlCachePath(url);
        bool shouldRedownload = !htmlPath.exists() || isStale(htmlPath.readText(), itag);
        if(shouldRedownload)
        {
            logger.display("Cache miss, downloading HTML...");
            string html = this.downloadAsString(url);
            htmlPath.write(html);
            // base.js is versioned with the page, so refresh it alongside the HTML.
            string baseJS = this.downloadAsString(html.parseBaseJSURL());
            baseJSCachePath(url).write(baseJS);
        }
        else
        {
            logger.display("Cache hit, skipping HTML download...");
        }
    }

    /// True when the "expire" timestamp embedded in the stream URL for `itag`
    /// is in the past. A shallow parser is enough here: the advanced variant
    /// gets an empty base.js because no URL deciphering is needed to read
    /// the expiration timestamp.
    private bool isStale(string html, int itag)
    {
        YoutubeVideoURLExtractor shallowParser = html.indexOf("signatureCipher") == -1
            ? new SimpleYoutubeVideoURLExtractor(html, logger)
            : new AdvancedYoutubeVideoURLExtractor(html, "", logger);
        ulong expire = shallowParser.findExpirationTimestamp(itag);
        return SysTime.fromUnixTime(expire) < Clock.currTime();
    }

    /// Cache file path (without extension) for `url`: the parsed video ID
    /// when available, otherwise a filesystem-safe Base64URL of the URL.
    private string getCachePath(string url)
    {
        string cacheKey = url.parseID();
        if(cacheKey == "")
        {
            cacheKey = Base64URL.encode(cast(ubyte[]) url);
        }

        return buildPath(cacheDirectory, cacheKey);
    }
}

unittest
{
    writeln("Given SimpleYoutubeVideoURLExtractor, when cache is stale, should redownload HTML");
    // Stub downloader that records whether the network would have been hit.
    bool networkHit = false;
    auto fakeDownload = delegate string(string _) {
        networkHit = true;
        return "zoz.html".readText();
    };

    auto cache = Cache(new StdoutLogger(), fakeDownload);
    cache.cacheDirectory = getcwd();

    // The zoz fixture carries an expired timestamp, so a download must occur.
    cache.makeParser("https://youtu.be/zoz", 18);
    assert(networkHit);
}

unittest
{
    writeln("Given SimpleYoutubeVideoURLExtractor, when cache is fresh, should not download HTML");
    // Stub downloader that records whether the network would have been hit.
    bool networkHit = false;
    auto fakeDownload = delegate string(string _) {
        networkHit = true;
        return "zoz.html".readText();
    };

    auto cache = Cache(new StdoutLogger(), fakeDownload);
    cache.cacheDirectory = getcwd();

    // Mock a previously cached page whose stream URLs expire tomorrow.
    SysTime tomorrow = Clock.currTime() + 1.days;
    "zoz-fresh.html".write("zoz.html".readText().dup.replace("expire=1638935038", "expire=" ~ tomorrow.toUnixTime().to!string));

    cache.makeParser("https://youtu.be/zoz-fresh", 18);
    assert(!networkHit);
}

unittest
{
    writeln("Given AdvancedYoutubeVideoURLExtractor, when cache is stale, should redownload HTML");
    // Stub downloader that records whether the network would have been hit.
    bool networkHit = false;
    auto fakeDownload = delegate string(string _) {
        networkHit = true;
        return "dQw4w9WgXcQ.html".readText();
    };

    auto cache = Cache(new StdoutLogger(), fakeDownload);
    cache.cacheDirectory = getcwd();

    // The dQw4w9WgXcQ fixture carries an expired timestamp, so a download must occur.
    cache.makeParser("https://youtu.be/dQw4w9WgXcQ", 18);
    assert(networkHit);
}

unittest
{
    writeln("Given AdvancedYoutubeVideoURLExtractor, when cache is fresh, should not download HTML");
    // Stub downloader that records whether the network would have been hit.
    bool networkHit = false;
    auto fakeDownload = delegate string(string _) {
        networkHit = true;
        return "dQw4w9WgXcQ-fresh.html".readText();
    };

    auto cache = Cache(new StdoutLogger(), fakeDownload);
    cache.cacheDirectory = getcwd();

    // Mock previously cached and fresh files: both the base.js and an HTML
    // page whose ciphered stream URLs expire tomorrow.
    SysTime tomorrow = Clock.currTime() + 1.days;
    "dQw4w9WgXcQ-fresh.js".write("base.min.js".readText());
    "dQw4w9WgXcQ-fresh.html".write(
        "dQw4w9WgXcQ.html".readText().dup.replace("expire%3D1677997809", "expire%3D" ~ tomorrow.toUnixTime().to!string)
    );

    cache.makeParser("https://youtu.be/dQw4w9WgXcQ-fresh", 18);
    assert(!networkHit);
}
39 changes: 38 additions & 1 deletion source/helpers.d
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,8 @@ import std.regex : ctRegex, matchFirst, escaper, regex, Captures;
import std.algorithm : filter;
import std.conv : to;
import std.net.curl : HTTP;
import std.string : split;
import std.string : split, indexOf, startsWith;
import std.format : formattedRead;

ulong getContentLength(string url)
{
Expand Down Expand Up @@ -33,6 +34,11 @@ string sanitizePath(string path)

string[string] parseQueryString(string input)
{
auto questionMarkIndex = input.indexOf("?");
if(questionMarkIndex != -1)
{
input = input[questionMarkIndex + 1 .. $];
}
string[string] result;
foreach(params; input.split("&"))
{
Expand Down Expand Up @@ -158,3 +164,34 @@ string formatTitle(string input)
{
return "\033[1m" ~ input ~ "\033[0m";
}

/// Extracts the video ID from a YouTube URL.
///
/// Supports watch URLs (any position of the `v` parameter, not only `?v=`),
/// youtu.be short links and /shorts/ URLs, with or without a leading
/// "https://" and with or without a trailing query string (e.g. `?t=42`).
/// Returns an empty string when no ID can be recognized.
string parseID(string url)
{
    if(!url.startsWith("https://"))
    {
        url = "https://" ~ url;
    }

    // Watch URLs: take the "v" query parameter wherever it appears.
    auto queryIndex = url.indexOf("?");
    if(queryIndex != -1)
    {
        auto params = url[queryIndex + 1 .. $].parseQueryString();
        if("v" in params)
        {
            return params["v"];
        }
    }

    string id;
    if(url.indexOf("youtu.be") != -1)
    {
        url.formattedRead!"https://youtu.be/%s"(id);
    }
    else if(url.indexOf("shorts") != -1)
    {
        url.formattedRead!"https://www.youtube.com/shorts/%s"(id);
    }

    // %s is greedy: drop any query string it captured after the ID.
    auto idQueryIndex = id.indexOf("?");
    return idQueryIndex == -1 ? id : id[0 .. idQueryIndex];
}

unittest
{
    // One case per supported URL flavour, plus one unrecognizable input.
    assert(parseID("https://www.youtube.com/watch?v=-H-Fno9xbE4") == "-H-Fno9xbE4");
    assert(parseID("https://youtu.be/-H-Fno9xbE4") == "-H-Fno9xbE4");
    assert(parseID("https://www.youtube.com/shorts/_tT2ldpZHek") == "_tT2ldpZHek");
    assert(parseID("qlsdkqsldkj") == "");
}
19 changes: 17 additions & 2 deletions source/parsers.d
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ abstract class YoutubeVideoURLExtractor
protected Document parser;

abstract public string getURL(int itag = 18);
abstract public ulong findExpirationTimestamp(int itag);

public string getTitle()
{
Expand Down Expand Up @@ -89,6 +90,13 @@ class SimpleYoutubeVideoURLExtractor : YoutubeVideoURLExtractor
.matchOrFail(`"itag":` ~ itag.to!string ~ `,"url":"(.*?)"`)
.replace(`\u0026`, "&");
}

/// Returns the Unix timestamp at which the direct stream URL for `itag`
/// expires, read from the URL's "expire" query parameter.
override ulong findExpirationTimestamp(int itag)
{
string videoURL = getURL(itag);
string[string] params = videoURL.parseQueryString();
return params["expire"].to!ulong;
}
}

unittest
Expand Down Expand Up @@ -199,7 +207,14 @@ class AdvancedYoutubeVideoURLExtractor : YoutubeVideoURLExtractor
return params["url"].decodeComponent() ~ "&" ~ params["sp"] ~ "=" ~ sig;
}

private string findSignatureCipher(int itag)
/// Returns the Unix timestamp at which the stream URL for `itag` expires,
/// read from the "expire" parameter of the URL nested inside signatureCipher.
override ulong findExpirationTimestamp(int itag)
{
string signatureCipher = findSignatureCipher(itag);
string[string] params = signatureCipher.parseQueryString()["url"].decodeComponent().parseQueryString();
// to!ulong matches the return type and the simple extractor; to!int would
// throw ConvOverflowException for timestamps beyond int range (post-2038).
return params["expire"].to!ulong;
}

string findSignatureCipher(int itag)
{
string encoded = "itag%3D" ~ itag.to!string;
long index = html.indexOf(encoded);
Expand Down Expand Up @@ -282,7 +297,7 @@ YoutubeVideoURLExtractor makeParser(string html, StdoutLogger logger)
return makeParser(html, baseJSURL => baseJSURL.get().idup, logger);
}

YoutubeVideoURLExtractor makeParser(string html, string function(string) performGETRequest, StdoutLogger logger)
YoutubeVideoURLExtractor makeParser(string html, string delegate(string) performGETRequest, StdoutLogger logger)
{
if(html.canFind("signatureCipher"))
{
Expand Down
Loading