Skip to content

Added webscraper for Thrift Savings Plan #157

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit into from
Feb 6, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions docs/source/remote_data.rst
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@ Currently the following sources are supported:
- :ref:`World Bank<remote_data.wb>`
- :ref:`OECD<remote_data.oecd>`
- :ref:`Eurostat<remote_data.eurostat>`
- :ref:`Thrift Savings Plan<remote_data.tsp>`

Note that different sources support different kinds of data, so not all sources implement the same methods, and the data elements returned may also differ.

Expand Down Expand Up @@ -422,3 +423,16 @@ reconnect after waiting a few minutes.
import pandas_datareader.data as web
ed = web.DataReader('daily', 'edgar-index', '1998-05-18', '1998-05-18')
ed[:5]

.. _remote_data.tsp:

TSP Fund Data
=============

Download mutual fund index prices for the Thrift Savings Plan (TSP).

.. ipython:: python

import pandas_datareader.tsp as tsp
tspreader = tsp.TSPReader(start='2015-10-1', end='2015-12-31')
tspreader.read()

1 change: 1 addition & 0 deletions docs/source/whatsnew.rst
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@ What's New

These are new features and improvements of note in each release.

.. include:: whatsnew/v0.2.3.txt
.. include:: whatsnew/v0.2.2.txt
.. include:: whatsnew/v0.2.1.txt
.. include:: whatsnew/v0.2.0.txt
31 changes: 31 additions & 0 deletions docs/source/whatsnew/v0.2.3.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
.. _whatsnew_023:

v0.2.3 (XXX)
----------------------------

This is a minor release from 0.2.2 and includes new features.


Highlights include:


.. contents:: What's new in v0.2.3
:local:
:backlinks: none

.. _whatsnew_023.enhancements:

New features
~~~~~~~~~~~~

- ``DataReader`` now supports pulling fund price data for the Thrift Savings Plan (TSP).

.. _whatsnew_023.api_breaking:

Backwards incompatible API changes
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. _whatsnew_023.bug_fixes:

Bug Fixes
~~~~~~~~~
14 changes: 11 additions & 3 deletions pandas_datareader/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,14 +91,22 @@ def _read_url_as_StringIO(self, url, params=None):
Open url (and retry)
"""
response = self._get_response(url, params=params)
text = self._sanitize_response(response)
out = StringIO()
if isinstance(response.content, compat.binary_type):
out.write(bytes_to_str(response.content))
if isinstance(text, compat.binary_type):
out.write(bytes_to_str(text))
else:
out.write(response.content)
out.write(text)
out.seek(0)
return out

@staticmethod
def _sanitize_response(response):
    """
    Return the payload of ``response`` without modifying it.

    Default implementation of the clean-up hook: it hands back
    ``response.content`` as-is. Subclasses may override it to tidy up
    the data they receive.
    """
    payload = response.content
    return payload

def _get_response(self, url, params=None):
""" send raw HTTP request to get requests.Response from the specified url
Parameters
Expand Down
32 changes: 32 additions & 0 deletions pandas_datareader/tests/test_base.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
import nose
import pandas.util.testing as tm
import pandas_datareader.base as base


class TestBaseReader(tm.TestCase):
    """Checks that ``_BaseReader`` rejects bad input and abstract usage."""

    def test_valid_retry_count(self):
        # Neither a non-numeric nor a negative retry count is accepted.
        for bad_count in ('stuff', -1):
            with tm.assertRaises(ValueError):
                base._BaseReader([], retry_count=bad_count)

    def test_invalid_url(self):
        # Reading ``url`` on the base class must raise.
        with tm.assertRaises(NotImplementedError):
            base._BaseReader([]).url

    def test_invalid_format(self):
        # An unrecognised ``_format`` makes ``_read_one_data`` raise.
        with tm.assertRaises(NotImplementedError):
            reader = base._BaseReader([])
            reader._format = 'IM_NOT_AN_IMPLEMENTED_TYPE'
            reader._read_one_data('a', None)


class TestDailyBaseReader(tm.TestCase):
    """Checks the abstract parts of ``_DailyBaseReader``."""

    def test_get_params(self):
        # ``_get_params`` is expected to raise on the base class.
        with tm.assertRaises(NotImplementedError):
            reader = base._DailyBaseReader()
            reader._get_params()

if __name__ == '__main__':
    # Run this module's tests through nose when executed as a script.
    nose_argv = [__file__, '-vvs', '-x', '--pdb', '--pdb-failure']
    nose.runmodule(argv=nose_argv, exit=False)
29 changes: 29 additions & 0 deletions pandas_datareader/tests/test_tsp.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
import nose
import pandas.util.testing as tm
import datetime as dt

import pandas_datareader.tsp as tsp


class TestTSPFunds(tm.TestCase):
    """Tests for ``TSPReader``: one download and the response clean-up."""

    def test_get_allfunds(self):
        # Requests a single day of prices from the URL TSPReader points at.
        tspdata = tsp.TSPReader(start='2015-11-2', end='2015-11-2').read()

        # A single requested day should produce exactly one row. The
        # closing parenthesis has to come before the comparison: the old
        # ``len(tspdata == 1)`` took the length of the comparison result
        # and therefore passed for any non-empty frame.
        assert len(tspdata) == 1

        assert round(tspdata['I Fund'][dt.date(2015, 11, 2)], 5) == 25.0058

    def test_sanitize_response(self):
        # Minimal stand-in exposing only the ``text`` attribute that
        # ``_sanitize_response`` reads.
        class FakeResponse(object):
            pass

        r = FakeResponse()

        # Surrounding whitespace and one trailing comma are removed.
        r.text = ' , '
        ret = tsp.TSPReader._sanitize_response(r)
        assert ret == ''

        # Commas inside the text are left alone.
        r.text = ' a,b '
        ret = tsp.TSPReader._sanitize_response(r)
        assert ret == 'a,b'

if __name__ == '__main__':
    # Run this module's tests through nose when executed as a script.
    nose_argv = [__file__, '-vvs', '-x', '--pdb', '--pdb-failure']
    nose.runmodule(argv=nose_argv, exit=False)
64 changes: 64 additions & 0 deletions pandas_datareader/tsp.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,64 @@
from pandas_datareader.base import _BaseReader


class TSPReader(_BaseReader):

    """
    Returns DataFrame of historical TSP fund prices from symbols, over date
    range, start to end.

    Parameters
    ----------
    symbols : array-like of strings, default None
        TSP fund identifiers, sent to the server as the ``fundgroup`` query
        parameter. When None, the funds 'Linc', 'L2020', 'L2030', 'L2040',
        'L2050', 'G', 'F', 'C', 'S' and 'I' are requested.
    start : string, default None
        Starting date; passed through to ``_BaseReader``.
    end : string, default None
        Ending date. Same format as starting date; passed through to
        ``_BaseReader``.
    retry_count : int, default 3
        Number of times to retry query request.
    pause : float, default 0.001
        Pause setting, in seconds; passed through to ``_BaseReader``.
    session : Session, default None
        requests.sessions.Session instance to be used
    """

    # Funds requested when the caller does not pass ``symbols``. Stored as
    # a tuple so the default can never be mutated through one instance.
    _DEFAULT_SYMBOLS = ('Linc', 'L2020', 'L2030', 'L2040', 'L2050',
                        'G', 'F', 'C', 'S', 'I')

    def __init__(self, symbols=None, start=None, end=None, retry_count=3,
                 pause=0.001, session=None):
        if symbols is None:
            # Fresh list per instance instead of a shared mutable default.
            symbols = list(self._DEFAULT_SYMBOLS)
        super(TSPReader, self).__init__(symbols=symbols,
                                        start=start, end=end,
                                        retry_count=retry_count,
                                        pause=pause, session=session)
        self._format = 'string'

    @property
    def url(self):
        """ URL the fund price request is sent to """
        return 'https://www.tsp.gov/InvestmentFunds/FundPerformance/index.html'

    def read(self):
        """ read the data and strip whitespace from the column names """
        df = super(TSPReader, self).read()
        # Build a real list: ``map`` is a lazy iterator on Python 3.
        df.columns = [name.strip() for name in df.columns]
        return df

    @property
    def params(self):
        """ query parameters selecting the date range, funds and CSV """
        return {'startdate': self.start.strftime('%m/%d/%Y'),
                'enddate': self.end.strftime('%m/%d/%Y'),
                'fundgroup': self.symbols,
                'whichButton': 'CSV'}

    @staticmethod
    def _sanitize_response(response):
        """
        Clean up the response string

        Strips surrounding whitespace from ``response.text`` and drops a
        single trailing comma if one is present. An empty or
        whitespace-only response yields ``''``.
        """
        text = response.text.strip()
        # ``endswith`` is safe on an empty string, unlike ``text[-1]``.
        if text.endswith(','):
            return text[:-1]
        return text
2 changes: 1 addition & 1 deletion setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ def readme():
return f.read()

INSTALL_REQUIRES = (
['pandas', 'requests', 'requests-file', 'requests-ftp']
['pandas', 'requests>=2.3.0', 'requests-file', 'requests-ftp']
)

setup(
Expand Down