-
Notifications
You must be signed in to change notification settings - Fork 165
/
empresas_generator.py
69 lines (55 loc) · 2.06 KB
/
empresas_generator.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
#!/usr/bin/env python3
# -*- coding: utf-8 -*- #
import json
import os
import sys
from urllib.request import urlopen
try:
from slugify import slugify
except ImportError:
print("Este programa requer python-slugify. Por favor, instale-o usando:\npython3 -m pip install python-slugify\n")
sys.exit(1)
# Directory where the generated JSON files are written
PAGE_PATH = 'content/empresas/'

# Prefix prepended to the logo paths found in the README
EMPRESAS_LOGO_PATH = (
    'https://raw.githubusercontent.com/pythonbrasil/pyBusinesses-BR/master/'
)

# File that is consumed to generate the JSON files
EMPRESAS_FILE = EMPRESAS_LOGO_PATH + 'README.md'
def scrapping_empresas(url=None, logo_base=None):
    """Scrape the list of companies out of a Markdown document.

    The document is read as a three-level hierarchy of headings --
    ``## region``, ``### state``, ``#### city`` -- in which every line that
    starts with ``!`` is a pipe-separated row shaped like
    ``![alt](logo path) | name | [label](site url)``.

    Rows that appear before a region, state and city have all been seen are
    ignored. A new region heading clears the current state and city, and a
    new state heading clears the current city, so a row is never attributed
    to a location left over from a previous section.

    Args:
        url: Address of the Markdown document to read. Defaults to
            ``EMPRESAS_FILE``.
        logo_base: Prefix prepended to each row's logo path. Defaults to
            ``EMPRESAS_LOGO_PATH``.

    Returns:
        A list of dicts, in document order, with the keys ``nome``,
        ``regiao``, ``estado``, ``cidade``, ``site`` and ``logo``.
    """
    if url is None:
        url = EMPRESAS_FILE
    if logo_base is None:
        logo_base = EMPRESAS_LOGO_PATH

    # Close the response deterministically instead of leaking the connection.
    with urlopen(url) as response:
        markdown = response.read().decode(encoding='utf-8')

    region = state = city = ''
    empresas = []
    for number, line in enumerate(markdown.split('\n'), start=1):
        if line.startswith('## '):
            region = line[2:].strip()
            # The state/city of the previous region no longer apply.
            state = city = ''
        elif line.startswith('### '):
            state = line[3:].strip()
            # The city of the previous state no longer applies.
            city = ''
        elif line.startswith('#### '):
            city = line[4:].strip()
        elif line.startswith('!') and region and state and city:
            parts = line.split('|')
            try:
                site = parts[2].split('(')[1].strip().strip(')')
                name = parts[1].strip()
                logo = logo_base + parts[0].split('(')[1].strip().strip(')')
            except IndexError:
                # A single malformed row must not abort the whole run;
                # report it and carry on with the remaining rows.
                print(
                    'Aviso: linha {0} ignorada (formato inesperado): {1}'
                    .format(number, line.strip()),
                    file=sys.stderr)
                continue
            empresas.append({
                'nome': name,
                'regiao': region,
                'estado': state,
                'cidade': city,
                'site': site,
                'logo': logo,
            })
    return empresas
def main():
    """Write one JSON file per scraped company into ``PAGE_PATH``.

    Each file is named ``<slug of nome>-<slug of cidade>.json``, holds the
    company dict serialized as JSON, and has its path printed once written.
    """
    empresas = scrapping_empresas()
    if not empresas:
        # Nothing was scraped, so nothing (not even the directory) is created.
        return

    # Create the output directory once, up front. exist_ok replaces the
    # per-iteration exists()/makedirs() pair and its check-then-create race.
    os.makedirs(PAGE_PATH, exist_ok=True)

    for empresa in empresas:
        filename = '{0}-{1}.json'.format(
            slugify(empresa['nome']), slugify(empresa['cidade']))
        file_path = os.path.join(PAGE_PATH, filename)
        with open(file_path, mode='w', encoding='utf-8') as output:
            json.dump(empresa, output)
        print("Gerado: {}".format(file_path))


# Run the generator only when executed as a script, not when imported.
if __name__ == '__main__':
    main()