Add prettify functionality; Update docs
fadoaglauss committed Oct 16, 2020
1 parent 03d9f8d commit d9ea9f4
Showing 16 changed files with 1,232 additions and 209 deletions.
4 changes: 4 additions & 0 deletions .gitignore
@@ -0,0 +1,4 @@
.ipynb_checkpoints
crawler/__pycache__
.vscode
util/__pycache__
195 changes: 154 additions & 41 deletions Coding Dojo - Crawler.ipynb
@@ -11,7 +11,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"<img style=\"float:left\" src=\"imgs/vamos_coletar.png\">\n",
"<img style=\"float:left\" src=\"docs/imgs/vamos_coletar.png\">\n",
"## Regras\n",
"\n",
"Ponto de participação, poderá perdido se:\n",
@@ -50,9 +50,27 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 2,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Defaulting to user installation because normal site-packages is not writeable\n",
"Requirement already satisfied: bs4 in /home/fadoaglauss/.local/lib/python3.8/site-packages (0.0.1)\n",
"Requirement already satisfied: beautifulsoup4 in /home/fadoaglauss/.local/lib/python3.8/site-packages (from bs4) (4.9.1)\n",
"Requirement already satisfied: soupsieve>1.2 in /home/fadoaglauss/.local/lib/python3.8/site-packages (from beautifulsoup4->bs4) (2.0.1)\n",
"Defaulting to user installation because normal site-packages is not writeable\n",
"Requirement already satisfied: requests in /usr/lib/python3.8/site-packages (2.24.0)\n",
"Requirement already satisfied: chardet>=3.0.2 in /usr/lib/python3.8/site-packages (from requests) (3.0.4)\n",
"Requirement already satisfied: idna>=2.5 in /usr/lib/python3.8/site-packages (from requests) (2.10)\n",
"Requirement already satisfied: urllib3>=1.21.1 in /usr/lib/python3.8/site-packages (from requests) (1.25.10)\n",
"Defaulting to user installation because normal site-packages is not writeable\n",
"Requirement already satisfied: lxml in /home/fadoaglauss/.local/lib/python3.8/site-packages (4.5.2)\n"
]
}
],
"source": [
"!pip install bs4\n",
"!pip install requests\n",
@@ -77,7 +95,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"<img src=\"imgs/arquitetura_coletor.png\">"
"<img src=\"docs/imgs/arquitetura_coletor.png\">"
]
},
{
@@ -99,7 +117,7 @@
"metadata": {},
"source": [
"<figure style=\"text-align:center\">\n",
" <img src=\"imgs/estrutura_coletor.png\">\n",
" <img src=\"docs/imgs/estrutura_coletor.png\">\n",
" <caption>Fonte: Baeza-Yates e Ribeiro-Neto, 2011 </caption>\n",
"</figure>"
]
@@ -141,9 +159,23 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 3,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Verificando acesso a um dominio já requisitado (após espera)\n",
"aguardando 10 segundos...\n",
".\n",
"----------------------------------------------------------------------\n",
"Ran 1 test in 10.011s\n",
"\n",
"OK\n"
]
}
],
"source": [
"!python -m crawler.scheduler_test DomainTest.test_domain"
]
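For context, a minimal sketch of the time gate this DomainTest exercises, assuming one politeness delay per domain; the class and member names below are illustrative guesses, not the repository's verified Domain interface:

import time

class Domain:
    def __init__(self, nam_domain, int_time_limit_between_requests):
        self.nam_domain = nam_domain
        self.int_time_limit_between_requests = int_time_limit_between_requests
        self.time_last_access = 0.0  # epoch seconds; 0 means never accessed

    def accessed_now(self):
        # record that a request to this domain was just made
        self.time_last_access = time.time()

    def is_accessible(self):
        # the domain may be requested again only after the delay elapses
        return time.time() - self.time_last_access >= self.int_time_limit_between_requests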
@@ -157,9 +189,17 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 4,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"True\n"
]
}
],
"source": [
"from collections import OrderedDict\n",
"class Xuxu():\n",
Expand All @@ -184,9 +224,18 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 1,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"oi\n",
"oi\n"
]
}
],
"source": [
"from crawler.domain import Domain\n",
"from collections import OrderedDict\n",
@@ -227,7 +276,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
@@ -290,11 +339,26 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 21,
"metadata": {
"tags": []
},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Verificação da ordem das URLs...\n",
"Resgatando a segunda página do mesmo dominio...\n",
"Tempo esperado: 30 segundos\n",
".\n",
"----------------------------------------------------------------------\n",
"Ran 1 test in 30.597s\n",
"\n",
"OK\n"
]
}
],
"source": [
"!python -m crawler.scheduler_test SchedulerTest.test_add_remove_pages"
]
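A rough sketch of the round-robin dispatch this ordering test implies: one FIFO of URLs per domain kept in an OrderedDict, with a domain skipped until its politeness delay elapses. It reuses the hypothetical Domain sketch above; dic_url_per_domain and get_next_url are likewise assumed names, not the repository's verified internals:

from collections import OrderedDict

class Scheduler:
    def __init__(self):
        # hypothetical internals: Domain -> list of (url, depth) pairs
        self.dic_url_per_domain = OrderedDict()

    def get_next_url(self):
        # serve domains round-robin, skipping any still inside its delay
        for domain, url_queue in list(self.dic_url_per_domain.items()):
            if url_queue and domain.is_accessible():
                domain.accessed_now()
                self.dic_url_per_domain.move_to_end(domain)  # fairness
                return url_queue.pop(0)
        return None  # nothing fetchable right now; the caller should wait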
@@ -308,11 +372,23 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 14,
"metadata": {
"tags": []
},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
".\r\n",
"----------------------------------------------------------------------\r\n",
"Ran 1 test in 131.024s\r\n",
"\r\n",
"OK\r\n"
]
}
],
"source": [
"!python -m crawler.scheduler_test SchedulerTest.test_can_fetch_page"
]
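A hedged sketch of such a robots.txt check using only the standard library; the function name and signature are assumed, and the blocking parser.read() network call is consistent with the long runtime recorded above:

from urllib.parse import ParseResult, urlunparse
from urllib.robotparser import RobotFileParser

def can_fetch_page(obj_url: ParseResult, str_usr_agent: str) -> bool:
    # download and parse the target domain's robots.txt
    parser = RobotFileParser()
    parser.set_url(f"{obj_url.scheme}://{obj_url.netloc}/robots.txt")
    parser.read()
    # True if the rules allow this user agent to fetch this URL
    return parser.can_fetch(str_usr_agent, urlunparse(obj_url))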
@@ -326,11 +402,23 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 15,
"metadata": {
"tags": []
},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
".\r\n",
"----------------------------------------------------------------------\r\n",
"Ran 1 test in 0.000s\r\n",
"\r\n",
"OK\r\n"
]
}
],
"source": [
"!python -m crawler.scheduler_test SchedulerTest.test_init"
]
@@ -358,11 +446,23 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 16,
"metadata": {
"tags": []
},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
".\r\n",
"----------------------------------------------------------------------\r\n",
"Ran 1 test in 0.533s\r\n",
"\r\n",
"OK\r\n"
]
}
],
"source": [
"!python -m crawler.page_fetcher_test PageFetcherTest.test_request_url"
]
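For illustration, a minimal sketch of the request step this test covers; the function name, the User-Agent header choice, and the HTML-only filter are assumptions, not the verified PageFetcher implementation:

import requests

def request_url(str_url: str, str_usr_agent: str):
    # identify the crawler by its user agent and keep only HTML responses
    response = requests.get(str_url, headers={"User-Agent": str_usr_agent})
    if response.ok and "text/html" in response.headers.get("Content-Type", ""):
        return response.content
    return None  # non-HTML (or failed) responses are discarded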
@@ -380,7 +480,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 17,
"metadata": {},
"outputs": [],
"source": [
@@ -390,9 +490,22 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 18,
"metadata": {},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Simulação da extração de links da página http://www.pudim.com.br na profundidade nível 2...\r\n",
".\r\n",
"----------------------------------------------------------------------\r\n",
"Ran 1 test in 0.001s\r\n",
"\r\n",
"OK\r\n"
]
}
],
"source": [
"!python -m crawler.page_fetcher_test PageFetcherTest.test_discover_links"
]
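A sketch of link discovery consistent with the depth-level simulation in the output above, using bs4 with the lxml parser (both installed earlier in the notebook); the generator shape and the names are illustrative:

from urllib.parse import urljoin, urlparse
from bs4 import BeautifulSoup

def discover_links(str_url, int_depth, bin_str_content):
    # parse the fetched HTML and yield each outgoing link one level deeper
    soup = BeautifulSoup(bin_str_content, features="lxml")
    for anchor in soup.select("a[href]"):
        str_new_url = urljoin(str_url, anchor["href"])
        yield urlparse(str_new_url), int_depth + 1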
@@ -419,21 +532,21 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"**Atividade 11 - um pequeno teste para finalizar: ** Use as sementes do seu grupo e crie abaixo um escalonador e 5 PageFetchers para extrair 30 páginas. Imprima também o tempo gasto total."
"**Atividade 11 - um pequeno teste para finalizar:** Use as sementes do seu grupo e crie abaixo um escalonador e 5 PageFetchers para extrair 30 páginas. Imprima também o tempo gasto total."
]
},
{
"cell_type": "code",
"execution_count": 7,
"execution_count": 19,
"metadata": {
"tags": []
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"output_type": "stream",
"text": [
"Tempo gasto: 120.671306\n"
"Tempo gasto: 62.269344 segundos\n"
]
}
],
@@ -445,30 +558,30 @@
"from crawler.scheduler import Scheduler\n",
"from urllib.parse import urlparse\n",
"from multiprocessing import Process\n",
"pages_fetchers = []\n",
"pages_fetchers_limit = 60\n",
"inicio = datetime.datetime.now()\n",
"\n",
"arr_str_urls_seeds = [\"http://cnn.com/\",\"https://pt.wikipedia.org/wiki/House,_M.D./\",\"https://globoesporte.globo.com/\"]\n",
"arr_urls_seeds = [(urlparse(str_url),0) for str_url in arr_str_urls_seeds]\n",
"scheduler = Scheduler(str_usr_agent=\"bifaroBot\",int_page_limit=1000,int_depth_limit=6,arr_urls_seeds=arr_urls_seeds)\n",
"\n",
"pages_fetchers = []\n",
"pages_fetchers_limit = 5\n",
"arr_str_urls_seeds = [\"http://cnn.com/\",\n",
" \"https://pt.wikipedia.org/wiki/House,_M.D./\", \"https://globoesporte.globo.com/\"]\n",
"arr_urls_seeds = [(urlparse(str_url), 0) for str_url in arr_str_urls_seeds]\n",
"scheduler = Scheduler(str_usr_agent=\"bifaroBot\", int_page_limit=30,\n",
" int_depth_limit=6, arr_urls_seeds=arr_urls_seeds)\n",
"\n",
"for a in range(0,pages_fetchers_limit):\n",
"for a in range(0, pages_fetchers_limit):\n",
" pages_fetchers.append(PageFetcher(scheduler))\n",
"\n",
"proc = []\n",
"start = datetime.datetime.now()\n",
"for pages_fetcher in pages_fetchers:\n",
" p = Process(target=pages_fetcher.run())\n",
" p.start()\n",
" proc.append(p)\n",
"\n",
"for p in proc:\n",
" p.join()\n",
"\n",
"fim = datetime.datetime.now()\n",
"print(f\"Tempo gasto: {(fim-inicio).total_seconds()}\")\n",
"with open(\"times.txt\",\"a\",encoding=\"utf-8\") as file:\n",
" file.write(f\"Quantidade escalonadores: {pages_fetchers_limit};Tempo gasto: {(fim-inicio).total_seconds()}\\n\")"
"end = datetime.datetime.now()\n",
"print(f\"Tempo gasto: {(end-start).total_seconds()} segundos\")\n"
]
},
{
@@ -503,9 +616,9 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8.2-final"
"version": "3.8.5"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
}
