Skip to content

Commit

Permalink
Added better regex for style and jupyterlab
Browse files Browse the repository at this point in the history
  • Loading branch information
richardautry committed Aug 29, 2021
1 parent f6abe07 commit e82b009
Show file tree
Hide file tree
Showing 3 changed files with 196 additions and 3 deletions.
120 changes: 120 additions & 0 deletions notebooks/Untitled.ipynb
Original file line number Diff line number Diff line change
@@ -0,0 +1,120 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 17,
"id": "9c507e95-0326-4cc4-9685-72a217993db8",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(?!jQuery.)(.*\\bdark).*|(?!jQuery.)(.*\\bDARK).*|(?!jQuery.)(.*\\bDark).*\n"
]
}
],
"source": [
"tag = \"dark\"\n",
"\n",
"original_regex_tag = \"(?!jQuery.)(.*\\\\b{}).*\"\n",
"regex_tag = original_regex_tag.format(tag)\n",
"for f in [\"upper\", \"title\"]:\n",
" altered_regex = original_regex_tag.format(getattr(tag, f)())\n",
" regex_tag += f\"|{altered_regex}\"\n",
"\n",
"print(regex_tag)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "3d201729-3e92-4e57-8c1b-c420c6506625",
"metadata": {},
"outputs": [
{
"ename": "SyntaxError",
"evalue": "invalid syntax (2522250776.py, line 3)",
"output_type": "error",
"traceback": [
"\u001b[0;36m File \u001b[0;32m\"/tmp/ipykernel_37576/2522250776.py\"\u001b[0;36m, line \u001b[0;32m3\u001b[0m\n\u001b[0;31m regex_tag += f\"|{regex_tag.format(getattr(tag, \"upper\")())}\"\u001b[0m\n\u001b[0m ^\u001b[0m\n\u001b[0;31mSyntaxError\u001b[0m\u001b[0;31m:\u001b[0m invalid syntax\n"
]
}
],
"source": [
"regex_tag = \"this{}\"\n",
"\n",
"regex_tag += f\"|{regex_tag.format(getattr(tag, \"upper\")())}\"\n",
"\n",
"print(regex_tag)\n"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "8a0f5e8a-79c4-46d5-9fda-813b93f97184",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"thisTHISthisTHIS\n"
]
}
],
"source": [
"tag = \"this\"\n",
"\n",
"getattr(tag, \"upper\")()\n",
"\n",
"regex_tag = \"this{}\".format(getattr(tag, \"upper\")())\n",
"\n",
"regex_tag += f\"{regex_tag}\"\n",
"\n",
"print(regex_tag)\n"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "b5d11363-60c8-4609-bfaa-2af4529825cb",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"upper\n",
"this\n"
]
}
],
"source": [
"for f in [\"upper\", \"this\"]:\n",
" print(f)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.10"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
33 changes: 31 additions & 2 deletions pub-crawler/pub-crawler/spiders/beer_spider.py
Original file line number Diff line number Diff line change
Expand Up @@ -98,13 +98,40 @@ def extract_value(field_spelling: List[str], regex: List[str]) -> [str, None]:
def extract_style():
# TODO: Generalize this function to take spellings and return
style_spelling = ['style', 'beer style']
style_tags = ['dark', 'saison', 'red', 'wine', 'barrel', 'aged']
style_tags = [
'dark',
'saison',
'red',
'wine',
'barrel',
'aged',
'russian',
'imperial',
'stout',
'lager',
'IPA',
'india pale ale',
'hazy',
'pils',
'pilsner'
]

matches = []

# TODO: This idea doesn't work yet. It doesn't find styles that exist and picks up a lot of jquery baggage along the way
for tag in style_tags:
regex_tag = f"(.*{tag}.*)|(.*{tag.upper()}.*)|(.*{tag.title()}.*)"
# TODO: use this regex to get rid of jQuery: (?!jQuery\.extend)(.*\bdark).*
# regex_tag = f"(?!jQuery.extend)(.*\\b{tag}).*|(?!jQuery.extend)(.*\\b{tag.upper()}).*|(?!jQuery.extend)(.*\\b{tag.title()}).*"

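# TODO: More often than not, this returns a bunch of '' matches. Not sure what is happening.
# NOTE(review): possibly caused by the alternation's capture groups -- re.findall() yields '' for every group that did not participate in a match; confirm.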
original_regex_tag = "(?<!jQuery.)(.*\\b{}).*"
regex_tag = original_regex_tag.format(tag)
for f in ["upper", "title"]:
altered_regex = original_regex_tag.format(getattr(tag, f)())
regex_tag += f"|{altered_regex}"

print(f"REGEX TAG: {regex_tag}")

# Add text that includes the given spellings of a style tag
# matches.append(extract_value([], [regex_tag]))
current_matches = response.xpath("//text()").re(regex_tag)
Expand All @@ -114,6 +141,8 @@ def extract_style():
# Collect counts of each
counts_dict = {match: matches.count(match) for match in matches}

print(f"COUNTS DICT: {counts_dict}")

if counts_dict:
return max(counts_dict, key=counts_dict.get)

Expand Down
46 changes: 45 additions & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
@@ -1,31 +1,60 @@
anyio==3.3.0
argon2-cffi==20.1.0
attrs==21.2.0
Automat==20.2.0
Babel==2.9.1
backcall==0.2.0
bleach==4.1.0
certifi==2021.5.30
cffi==1.14.6
charset-normalizer==2.0.4
constantly==15.1.0
cryptography==3.4.7
cssselect==1.1.0
debugpy==1.4.1
decorator==5.0.9
defusedxml==0.7.1
entrypoints==0.3
h2==3.2.0
hpack==3.0.0
hyperframe==5.2.0
hyperlink==21.0.0
idna==3.2
incremental==21.3.0
ipykernel==6.2.0
ipython==7.26.0
ipython-genutils==0.2.0
itemadapter==0.3.0
itemloaders==1.0.4
jedi==0.18.0
Jinja2==3.0.1
jmespath==0.10.0
json5==0.9.6
jsonschema==3.2.0
jupyter-client==7.0.1
jupyter-core==4.7.1
jupyter-server==1.10.2
jupyterlab==3.1.9
jupyterlab-pygments==0.1.2
jupyterlab-server==2.7.2
lxml==4.6.3
MarkupSafe==2.0.1
matplotlib-inline==0.1.2
mistune==0.8.4
nbclassic==0.3.1
nbclient==0.5.4
nbconvert==6.1.0
nbformat==5.1.3
nest-asyncio==1.5.1
notebook==6.4.3
packaging==21.0
pandocfilters==1.4.3
parsel==1.6.0
parso==0.8.2
pexpect==4.8.0
pickleshare==0.7.5
pkg-resources==0.0.0
priority==1.3.0
prometheus-client==0.11.0
prompt-toolkit==3.0.19
Protego==0.1.16
ptyprocess==0.7.0
Expand All @@ -35,12 +64,27 @@ pycparser==2.20
PyDispatcher==2.0.5
Pygments==2.9.0
pyOpenSSL==20.0.1
pyparsing==2.4.7
pyrsistent==0.18.0
python-dateutil==2.8.2
pytz==2021.1
pyzmq==22.2.1
queuelib==1.6.1
requests==2.26.0
requests-unixsocket==0.2.0
Scrapy==2.5.0
Send2Trash==1.8.0
service-identity==21.1.0
six==1.16.0
sniffio==1.2.0
terminado==0.11.1
testpath==0.5.0
tornado==6.1
traitlets==5.0.5
Twisted==21.2.0
urllib3==1.26.6
w3lib==1.22.0
wcwidth==0.2.5
webencodings==0.5.1
websocket-client==1.2.1
zope.interface==5.4.0

0 comments on commit e82b009

Please sign in to comment.