Commit: Clean up code, no functional changes
riley-martine committed May 8, 2022
1 parent a89e291 commit 8d73dd9
Showing 1 changed file with 44 additions and 29 deletions.
73 changes: 44 additions & 29 deletions main.py
@@ -1,10 +1,9 @@
+import argparse
 import string
-from bs4 import BeautifulSoup
-import urllib.request
 import time
-import os
-import argparse
 import re
+import urllib.request

+from bs4 import BeautifulSoup

 API = "https://www.urbandictionary.com/browse.php?character={0}"

@@ -18,27 +17,29 @@
 class NoRedirection(urllib.request.HTTPErrorProcessor):
     def http_response(self, request, response):
         return response

     https_response = http_response

-def extract_page_entries(letter, html):
+
+def extract_page_entries(html):
     soup = BeautifulSoup(html, "html.parser")
     # find word list element, this might change in the future
-    list = soup.find_all("ul", class_="mt-3 columns-2 md:columns-3")[0]
-    for li in list.find_all('li'):
-        a = li.find('a').string
+    ul = soup.find_all("ul", class_="mt-3 columns-2 md:columns-3")[0]
+    for li in ul.find_all("li"):
+        a = li.find("a").string
         if a:
-            # print(a)
             yield a

-def get_next(letter, html):
+
+def get_next(html):
     soup = BeautifulSoup(html, "html.parser")
-    next = soup.find('a', {"rel":"next"})
-    if next:
-        href = next['href']
-        return 'https://www.urbandictionary.com' + href
+    next_link = soup.find("a", {"rel": "next"})
+    if next_link:
+        href = next_link["href"]
+        return "https://www.urbandictionary.com" + href
     return None

+
 def extract_letter_entries(letter):
     url = API.format(letter)
     attempt = 0
@@ -48,8 +49,8 @@ def extract_letter_entries(letter):
         code = response.getcode()
         if code == 200:
             content = response.read()
-            yield list(extract_page_entries(letter, content))
-            url = get_next(letter, content)
+            yield list(extract_page_entries(content))
+            url = get_next(content)
             attempt = 0
         else:
             print(f"Trying again, expected response code: 200, got {code}")
@@ -58,36 +59,50 @@ def extract_letter_entries(letter):
                 break
         time.sleep(DELAY * attempt)

-opener = urllib.request.build_opener(NoRedirection, urllib.request.HTTPCookieProcessor())
+
+opener = urllib.request.build_opener(
+    NoRedirection, urllib.request.HTTPCookieProcessor()
+)
 urllib.request.install_opener(opener)


-letters = list(string.ascii_uppercase) + ['#']
+letters = list(string.ascii_uppercase) + ["#"]

+
 def download_letter_entries(letter, file):
     file = file.format(letter)
     for entry_set in extract_letter_entries(letter):
-        with open(file, 'a+', encoding='utf-8') as f:
-            data = ('\n'.join(entry_set))
-            f.write(data + '\n')
+        with open(file, "a+", encoding="utf-8") as f:
+            data = "\n".join(entry_set)
+            f.write(data + "\n")

+
 def download_entries(letters, file):
     for letter in letters:
         print(f"======={letter}=======")
         download_letter_entries(letter, file)

-parser = argparse.ArgumentParser(description='Process some integers.')
-
-parser.add_argument('--ifile', dest='ifile',
-                    help='input file name. Contains a list of letters separated by a newline', default="input.list")
+parser = argparse.ArgumentParser(description="Download urban dictionary words.")
+
+parser.add_argument(
+    "--ifile",
+    dest="ifile",
+    help="input file name. Contains a list of letters separated by a newline",
+    default="input.list",
+)

-parser.add_argument('--out', dest='out',
-                    help='output file name. May be a format string', default="data/{0}.data")
+parser.add_argument(
+    "--out",
+    dest="out",
+    help="output file name. May be a format string",
+    default="data/{0}.data",
+)

 args = parser.parse_args()

 letters = []
-with open(args.ifile, 'r') as ifile:
+with open(args.ifile, "r") as ifile:
     for row in ifile:
         letters.append(row.strip())

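Note: only the changed hunks of main.py appear above; the tail of the file is collapsed in this view. Given the helpers and argparse flags in the last hunk, the collapsed tail presumably ends by kicking off the download. A minimal sketch of that driver line follows (an assumption, not part of this diff):

    # Hypothetical driver, inferred from the functions shown above: for each
    # requested letter, format the --out template (e.g. data/{0}.data) and
    # append that letter's scraped entries to the resulting file.
    download_entries(letters, args.out)

Under that assumption, a run would look like: python main.py --ifile input.list --out data/{0}.data, where input.list holds one letter per line.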
