Skip to content

Commit

Permalink
Bug fixes
Browse files Browse the repository at this point in the history
+ Support for large course content in subscription courses (#92)
+ Attempt to fix encoding problems with caption conversion (#92, #98, #97)
  • Loading branch information
Puyodead1 committed Jan 13, 2022
1 parent 636b046 commit 94eb2ac
Show file tree
Hide file tree
Showing 2 changed files with 84 additions and 15 deletions.
94 changes: 81 additions & 13 deletions main.py
Original file line number Diff line number Diff line change
Expand Up @@ -761,22 +761,28 @@ def _extract_course_json_sub(self, selenium: Selenium, course_id, portal_name):

# wait for page load
WebDriverWait(selenium.driver, 60).until(
EC.visibility_of_element_located((By.TAG_NAME, "pre")))
EC.visibility_of_element_located((By.TAG_NAME, "body")))
time.sleep(2)

# TODO: determine if the course content is large

# get the text from the page
page_text = selenium.driver.find_element(By.TAG_NAME, "pre").text
if not page_text or not isinstance(page_text, str):
raise Exception("[-] Could not get page text!")
page_json = json.loads(page_text)
if page_json:
return page_json
body_text = selenium.driver.find_element(By.TAG_NAME, "body").text
if not body_text:
raise Exception("[-] Could not get page body text!")
if "502 Bad Gateway" in body_text:
# its a large course, handle accordingly
logger.info("[+] Detected large course content, using large content extractor...")
return self._extract_large_course_content_sub(url=url, selenium=selenium)
else:
logger.error("[-] Failed to extract course json!")
time.sleep(0.8)
sys.exit(1)
# get the text from the page
page_text = selenium.driver.find_element(By.TAG_NAME, "pre").text
if not page_text or not isinstance(page_text, str):
raise Exception("[-] Could not get page pre text!")
page_json = json.loads(page_text)
if page_json:
return page_json
else:
logger.error("[-] Failed to extract course json!")
time.sleep(0.8)
sys.exit(1)

def _extract_large_course_content(self, url):
url = url.replace("10000", "50") if url.endswith("10000") else url
Expand Down Expand Up @@ -804,6 +810,68 @@ def _extract_large_course_content(self, url):
data["results"].append(d)
return data

def _extract_large_course_content_sub(self, url, selenium: Selenium):
    """Fetch paginated course JSON through the Selenium browser and merge it.

    Loads ``url`` in ``selenium.driver``, parses the JSON rendered in the
    page's ``<pre>`` element, then keeps following the ``next`` link of each
    response, appending every page's ``results`` to the first page's
    ``results`` list.

    Args:
        url: Address of the first page. When it ends with ``10000`` that
            trailing value is swapped for ``50`` (presumably the page-size
            query value -- confirm against the caller).
        selenium: Wrapper object exposing the WebDriver as ``.driver``.

    Returns:
        The parsed first page, with ``results`` extended by the results of
        all following pages.

    Raises:
        Exception: If the page title contains "Attention" (treated as a
            Cloudflare captcha) or the ``<pre>`` element has no text.
        SystemExit: Via ``sys.exit(1)`` when a ``conn_error`` occurs.
    """

    def _load_json_page(page_url):
        # Shared by the first request and every follow-up page so both go
        # through identical captcha detection, waiting and parsing.
        try:
            selenium.driver.get(page_url)
            time.sleep(2)

            if "Attention" in selenium.driver.title:
                # cloudflare captcha, panic
                raise Exception("[-] Cloudflare captcha detected!")

            # wait for page load
            WebDriverWait(selenium.driver, 60).until(
                EC.visibility_of_element_located((By.TAG_NAME, "body")))
            time.sleep(2)

            # get the text from the page
            page_text = selenium.driver.find_element(By.TAG_NAME, "pre").text
            if not page_text or not isinstance(page_text, str):
                raise Exception("[-] Could not get page pre text!")
            page = json.loads(page_text)
            logger.debug(page)
            return page
        except conn_error as error:
            logger.fatal(f"[-] Udemy Says: Connection error, {error}")
            time.sleep(0.8)
            sys.exit(1)

    # Only rewrite the trailing value: str.replace() would also rewrite any
    # earlier "10000" in the URL (for example inside a numeric course id).
    oversized_suffix = "10000"
    if url.endswith(oversized_suffix):
        url = url[:-len(oversized_suffix)] + "50"

    data = _load_json_page(url)
    _next = data.get("next")
    while _next:
        logger.info("> Downloading course information.. ")
        resp = _load_json_page(_next)
        _next = resp.get("next")
        results = resp.get("results")
        if results and isinstance(results, list):
            # setdefault guards against a first page without "results".
            data.setdefault("results", []).extend(results)
    return data

def _extract_course(self, response, course_name):
_temp = {}
if response:
Expand Down
5 changes: 3 additions & 2 deletions vtt_to_srt.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from webvtt import WebVTT
import html, os
import html
import os
from pysrt.srtitem import SubRipItem
from pysrt.srttime import SubRipTime

Expand All @@ -8,7 +9,7 @@ def convert(directory, filename):
index = 0
vtt_filepath = os.path.join(directory, filename + ".vtt")
srt_filepath = os.path.join(directory, filename + ".srt")
srt = open(srt_filepath, "w")
srt = open(srt_filepath, mode='w', encoding='utf8', errors='ignore')

for caption in WebVTT().read(vtt_filepath):
index += 1
Expand Down

0 comments on commit 94eb2ac

Please sign in to comment.