# -*- coding:utf-8 -*-
# This script downloads all the 10-K, 10-Q and 8-K filings
# for a given company symbol and its CIK code.
import errno
import os

import requests
from bs4 import BeautifulSoup

from config import DEFAULT_DATA_PATH


class SecCrawler():

    def __init__(self):
        self.hello = "Welcome to Sec Crawler!"
        print("Path of the directory where data will be saved: " + DEFAULT_DATA_PATH)

    def make_directory(self, company_code, cik, priorto, filing_type):
        # Make the directory in which the company filings will be saved
        path = os.path.join(DEFAULT_DATA_PATH, company_code, cik, filing_type)
        if not os.path.exists(path):
            try:
                os.makedirs(path)
            except OSError as exception:
                if exception.errno != errno.EEXIST:
                    raise

    def save_in_directory(self, company_code, cik, priorto, doc_list,
                          doc_name_list, filing_type):
        # Save every text document into its respective folder
        for j in range(len(doc_list)):
            base_url = doc_list[j]
            r = requests.get(base_url)
            data = r.text
            path = os.path.join(DEFAULT_DATA_PATH, company_code, cik,
                                filing_type, doc_name_list[j])
            # Open in binary mode so the ASCII-encoded bytes can be written
            # directly (non-ASCII characters are dropped)
            with open(path, "ab") as f:
                f.write(data.encode('ascii', 'ignore'))

    def filing_10Q(self, company_code, cik, priorto, count):
        self.make_directory(company_code, cik, priorto, '10-Q')
        # generate the url to crawl
        base_url = "http://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=" + str(cik) + "&type=10-Q&dateb=" + str(priorto) + "&owner=exclude&output=xml&count=" + str(count)
        print("started 10-Q " + str(company_code))
        r = requests.get(base_url)
        data = r.text
        # get doc list data
        doc_list, doc_name_list = self.create_document_list(data)

        try:
            self.save_in_directory(company_code, cik, priorto, doc_list,
                                   doc_name_list, '10-Q')
        except Exception as e:
            print(str(e))

        print("Successfully downloaded all the files")

    def filing_10K(self, company_code, cik, priorto, count):
        self.make_directory(company_code, cik, priorto, '10-K')
        # generate the url to crawl
        base_url = "http://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=" + str(cik) + "&type=10-K&dateb=" + str(priorto) + "&owner=exclude&output=xml&count=" + str(count)
        print("started 10-K " + str(company_code))
        r = requests.get(base_url)
        data = r.text
        # get doc list data
        doc_list, doc_name_list = self.create_document_list(data)

        try:
            self.save_in_directory(company_code, cik, priorto, doc_list,
                                   doc_name_list, '10-K')
        except Exception as e:
            print(str(e))

        print("Successfully downloaded all the files")

    def filing_8K(self, company_code, cik, priorto, count):
        try:
            self.make_directory(company_code, cik, priorto, '8-K')
        except Exception as e:
            print(str(e))

        # generate the url to crawl
        base_url = "http://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=" + str(cik) + "&type=8-K&dateb=" + str(priorto) + "&owner=exclude&output=xml&count=" + str(count)
        print("started 8-K " + str(company_code))
        r = requests.get(base_url)
        data = r.text
        # get doc list data
        doc_list, doc_name_list = self.create_document_list(data)

        try:
            self.save_in_directory(company_code, cik, priorto, doc_list,
                                   doc_name_list, '8-K')
        except Exception as e:
            print(str(e))

        print("Successfully downloaded all the files")

    def filing_13F(self, company_code, cik, priorto, count):
        try:
            self.make_directory(company_code, cik, priorto, '13-F')
        except Exception as e:
            print(str(e))

        # generate the url to crawl
        base_url = "http://www.sec.gov/cgi-bin/browse-edgar?action=getcompany&CIK=" + str(cik) + "&type=13F&dateb=" + str(priorto) + "&owner=exclude&output=xml&count=" + str(count)
        print("started 13-F " + str(company_code))
        r = requests.get(base_url)
        data = r.text
        doc_list, doc_name_list = self.create_document_list(data)

        try:
            self.save_in_directory(company_code, cik, priorto, doc_list,
                                   doc_name_list, '13-F')
        except Exception as e:
            print(str(e))

        print("Successfully downloaded all the files")

    def create_document_list(self, data):
        # parse the fetched data using BeautifulSoup
        soup = BeautifulSoup(data, "html.parser")
        # store the filing index links in a list
        link_list = list()

        # If the link ends in .htm, convert it to .html
        for link in soup.find_all('filinghref'):
            url = link.string
            if link.string.split(".")[-1] == "htm":
                url += "l"
            link_list.append(url)
        link_list_final = link_list

        print("Number of files to download {0}".format(len(link_list_final)))
        print("Starting download....")

        # List of urls to the text documents
        doc_list = list()
        # List of document names
        doc_name_list = list()

        # Get all the docs
        for k in range(len(link_list_final)):
            required_url = link_list_final[k].replace('-index.html', '')
            txtdoc = required_url + ".txt"
            docname = txtdoc.split("/")[-1]
            doc_list.append(txtdoc)
            doc_name_list.append(docname)

        return doc_list, doc_name_list
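

# A minimal usage sketch, not part of the original module: it assumes
# config.DEFAULT_DATA_PATH points at a writable directory, and the
# ticker/CIK pair below ("AAPL" / 0000320193) is only an illustrative example.
if __name__ == "__main__":
    crawler = SecCrawler()
    # Download up to 10 of the company's 10-K filings dated before 2017-01-01
    crawler.filing_10K("AAPL", "0000320193", "20170101", "10")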