joeyism · nickderobertis · Aug 18, 2019 · Aug 18, 2019 · Aug 18, 2019
diff --git a/edgar/edgar.py b/edgar/edgar.py
@@ -1,6 +1,8 @@
 from lxml import html
 import requests
 
+BASE_URL = "https://www.sec.gov"
+
 class Company():
 
     def __init__(self, name, cik):
@@ -48,26 +50,93 @@ def findCompanyName(self, words):
         return possibleCompanies
 
 
+class Filing:
+    main_xpath = '//*[@id="formDiv"]/div/table/tr[2]/td[3]/a'
+
+    def __init__(self, elem):
+        self.url = BASE_URL + elem.attrib["href"]
+        self.elem = getRequest(self.url)
+
+    @property
+    def text_content(self):
+        return self._get_text_content_by_link_xpath(self.main_xpath)
+
+    @property
+    def content(self):
+        return self._get_html_by_link_xpath(self.main_xpath)
+
+    @property
+    def filing_date(self):
+        return self._get_filing_info('Filing Date')
+
+    @property
+    def accepted(self):
+        return self._get_filing_info('Accepted')
+
+    @property
+    def period_of_report(self):
+        return self._get_filing_info('Period of Report')
+
+    def sub_filing(self, sub_document, as_html = False):
+        xpath = '//*[@id="formDiv"]/div/table/tr[td[4]/text()="{sub_document}"]/td[3]/a'.format(
+            sub_document=sub_document
+        )
+        if as_html:
+            return self._get_html_by_link_xpath(xpath)
+        return self._get_text_content_by_link_xpath(xpath)
+
+    def _get_content_by_link_xpath(self, xpath):
+        url = BASE_URL + self.elem.xpath(xpath)[0].attrib["href"]
+        content = getRequest(url)
+        return content
+
+    def _get_text_content_by_link_xpath(self, xpath):
+        content = self._get_content_by_link_xpath(xpath)
+        return content.body.text_content()
+
+    def _get_html_by_link_xpath(self, xpath):
+        content = self._get_content_by_link_xpath(xpath)
+        return html.tostring(content).decode('utf8')
+
+    def _get_filing_info(self, info_str):
+        info_xpath = '//*[@id="formDiv"]//div[@class="formGrouping"]/div[preceding-sibling::div[1]/' \
+                     'text()="{info_str}"]/text()'.format(info_str=info_str)
+        return self.elem.xpath(info_xpath)[0]
+
 
 def getRequest(href):
     page = requests.get(href)
     return html.fromstring(page.content)
 
-def getDocuments(tree, noOfDocuments=1):
-    baseurl = "https://www.sec.gov"
-    elems = tree.xpath('//*[@id="documentsbutton"]')[:noOfDocuments]
-    result = []
-    for elem in elems:
-        url = baseurl + elem.attrib["href"]
-        contentPage = getRequest(url)
-        url = baseurl + contentPage.xpath('//*[@id="formDiv"]/div/table/tr[2]/td[3]/a')[0].attrib["href"]
-        filing = getRequest(url)
-        result.append(filing.body.text_content())
+
+def getDocuments(tree, sub_document=None, noOfDocuments=1, as_html=False):
+    filings = getFilings(tree, noOfDocuments=noOfDocuments)
+    if sub_document is None:
+        if as_html:
+            attr = 'content'
+        else:
+            attr = 'text_content'
+        result = [getattr(filing, attr) for filing in filings]
+    else:
+        result = [filing.sub_filing(sub_document, as_html=as_html) for filing in filings]
 
     if len(result) == 1:
         return result[0]
     return result
 
+
+def getFilings(tree, noOfDocuments=1):
+    elems = tree.xpath('//*[@id="documentsbutton"]')[:noOfDocuments]
+    return [Filing(elem) for elem in elems]
+
+
+def _get_sub_document_xpath(sub_document=None):
+    if sub_document is None:
+        return '//*[@id="formDiv"]/div/table/tr[2]/td[3]/a'
+
+    return '//*[@id="formDiv"]/div/table/tr[td[4]/text()="{sub_document}"]/td[3]/a'.format(sub_document=sub_document)
+
+
 def getCIKFromCompany(companyName):
     tree = getRequest("https://www.sec.gov/cgi-bin/browse-edgar?company=" + companyName)
     CIKList = tree.xpath('//*[@id="seriesDiv"]/table/tr[*]/td[1]/a/text()')