Bug fixes

* Support for large course content in subscription courses (#92)
* Attempt to fix encoding problems with caption conversion (#92, #97, #98)
Puyodead1 · Jan 13, 2022 · 94eb2ac
1 parent 636b046
commit 94eb2ac
Show file tree

Hide file tree

Showing 2 changed files with 84 additions and 15 deletions.
diff --git a/main.py b/main.py
@@ -761,22 +761,28 @@ def _extract_course_json_sub(self, selenium: Selenium, course_id, portal_name):
 
         # wait for page load
         WebDriverWait(selenium.driver, 60).until(
-            EC.visibility_of_element_located((By.TAG_NAME, "pre")))
+            EC.visibility_of_element_located((By.TAG_NAME, "body")))
         time.sleep(2)
 
-        # TODO: determine if the course content is large
-
-        # get the text from the page
-        page_text = selenium.driver.find_element(By.TAG_NAME, "pre").text
-        if not page_text or not isinstance(page_text, str):
-            raise Exception("[-] Could not get page text!")
-        page_json = json.loads(page_text)
-        if page_json:
-            return page_json
+        body_text = selenium.driver.find_element(By.TAG_NAME, "body").text
+        if not body_text:
+            raise Exception("[-] Could not get page body text!")
+        if "502 Bad Gateway" in body_text:
+            # its a large course, handle accordingly
+            logger.info("[+] Detected large course content, using large content extractor...")
+            return self._extract_large_course_content_sub(url=url, selenium=selenium)
         else:
-            logger.error("[-] Failed to extract course json!")
-            time.sleep(0.8)
-            sys.exit(1)
+            # get the text from the page
+            page_text = selenium.driver.find_element(By.TAG_NAME, "pre").text
+            if not page_text or not isinstance(page_text, str):
+                raise Exception("[-] Could not get page pre text!")
+            page_json = json.loads(page_text)
+            if page_json:
+                return page_json
+            else:
+                logger.error("[-] Failed to extract course json!")
+                time.sleep(0.8)
+                sys.exit(1)
 
     def _extract_large_course_content(self, url):
         url = url.replace("10000", "50") if url.endswith("10000") else url
@@ -804,6 +810,68 @@ def _extract_large_course_content(self, url):
                             data["results"].append(d)
             return data
 
+    def _extract_large_course_content_sub(self, url, selenium: Selenium):
+        url = url.replace("10000", "50") if url.endswith("10000") else url
+        try:
+            selenium.driver.get(url)
+            time.sleep(2)
+
+            if "Attention" in selenium.driver.title:
+                # cloudflare captcha, panic
+                raise Exception("[-] Cloudflare captcha detected!")
+
+            # wait for page load
+            WebDriverWait(selenium.driver, 60).until(
+                EC.visibility_of_element_located((By.TAG_NAME, "body")))
+            time.sleep(2)
+
+            # get the text from the page
+            page_text = selenium.driver.find_element(By.TAG_NAME, "pre").text
+            if not page_text or not isinstance(page_text, str):
+                raise Exception("[-] Could not get page pre text!")
+            data = json.loads(page_text)
+            logger.debug(data)
+
+        except conn_error as error:
+            logger.fatal(f"[-] Udemy Says: Connection error, {error}")
+            time.sleep(0.8)
+            sys.exit(1)
+        else:
+            _next = data.get("next")
+            while _next:
+                logger.info("> Downloading course information.. ")
+                try:
+                    selenium.driver.get(_next)
+                    time.sleep(2)
+
+                    if "Attention" in selenium.driver.title:
+                        # cloudflare captcha, panic
+                        raise Exception("[-] Cloudflare captcha detected!")
+
+                    # wait for page load
+                    WebDriverWait(selenium.driver, 60).until(
+                        EC.visibility_of_element_located((By.TAG_NAME, "body")))
+                    time.sleep(2)
+
+                    # get the text from the page
+                    page_text = selenium.driver.find_element(
+                        By.TAG_NAME, "pre").text
+                    if not page_text or not isinstance(page_text, str):
+                        raise Exception("[-] Could not get page pre text!")
+                    resp = json.loads(page_text)
+                    logger.debug(resp)
+                except conn_error as error:
+                    logger.fatal(f"[-] Udemy Says: Connection error, {error}")
+                    time.sleep(0.8)
+                    sys.exit(1)
+                else:
+                    _next = resp.get("next")
+                    results = resp.get("results")
+                    if results and isinstance(results, list):
+                        for d in resp["results"]:
+                            data["results"].append(d)
+            return data
+
     def _extract_course(self, response, course_name):
         _temp = {}
         if response:

diff --git a/vtt_to_srt.py b/vtt_to_srt.py
@@ -1,5 +1,6 @@
 from webvtt import WebVTT
-import html, os
+import html
+import os
 from pysrt.srtitem import SubRipItem
 from pysrt.srttime import SubRipTime
 
@@ -8,7 +9,7 @@ def convert(directory, filename):
     index = 0
     vtt_filepath = os.path.join(directory, filename + ".vtt")
     srt_filepath = os.path.join(directory, filename + ".srt")
-    srt = open(srt_filepath, "w")
+    srt = open(srt_filepath, mode='w', encoding='utf8', errors='ignore')
 
     for caption in WebVTT().read(vtt_filepath):
         index += 1