Skip to content

[ENH] Fama/French #56

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 1 commit on Aug 29, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 7 additions & 3 deletions docs/source/remote_data.rst
Original file line number Diff line number Diff line change
Expand Up @@ -155,14 +155,18 @@ FRED
Fama/French
===========

Access datasets from the `Fama/French Data Library
<http://mba.tuck.dartmouth.edu/pages/faculty/ken.french/data_library.html>`__.
The ``get_available_datasets`` function returns a list of all available datasets.

.. ipython:: python

from pandas_datareader.famafrench import get_available_datasets
import pandas_datareader.data as web
len(get_available_datasets())
ds = web.DataReader("5_Industry_Portfolios", "famafrench")
print(ds['DESCR'])
ds[4].ix['1926-07']

.. _remote_data.wb:

Expand Down
125 changes: 95 additions & 30 deletions pandas_datareader/famafrench.py
Original file line number Diff line number Diff line change
@@ -1,43 +1,108 @@
import tempfile
import numpy as np
import re
import datetime as dt
from pandas.io.common import urlopen, ZipFile
from pandas.compat import lmap
from pandas import DataFrame
from pandas.compat import lmap, StringIO
from pandas import read_csv, to_datetime

_URL = 'http://mba.tuck.dartmouth.edu/pages/faculty/ken.french/ftp'

_URL = 'http://mba.tuck.dartmouth.edu/pages/faculty/ken.french/'
_URL_PREFIX = 'ftp/'
_URL_SUFFIX = '_CSV.zip'

def _get_data(name):
# path of zip files
zip_file_path = '{0}/{1}_TXT.zip'.format(_URL, name)

with urlopen(zip_file_path) as url:
raw = url.read()
def get_available_datasets():
    """
    Get the list of datasets available from the Fama/French data library.

    Scrapes the data-library HTML page and returns the names of every
    ``ftp/<name>_CSV.zip`` archive linked from it.

    Returns
    -------
    list of str
        Valid dataset names to pass to ``DataReader(name, 'famafrench')``.

    Raises
    ------
    ImportError
        If lxml is not installed.
    """
    try:
        from lxml.html import parse
    except ImportError:
        # The error message must name the function the user actually called.
        raise ImportError("Please install lxml if you want to use the "
                          "get_available_datasets function")

    root = parse(_URL + 'data_library.html')

    # All anchors that point at a ftp/..._CSV.zip archive are datasets.
    hrefs = [e.attrib['href'] for e in root.findall('.//a')
             if 'href' in e.attrib]
    archives = [h for h in hrefs
                if h.startswith(_URL_PREFIX) and h.endswith(_URL_SUFFIX)]

    # Strip the common prefix/suffix, leaving the bare dataset name.
    return [a[len(_URL_PREFIX):-len(_URL_SUFFIX)] for a in archives]


def _download_data_famafrench(name):
    """
    Download the zipped CSV archive for dataset *name* and return its text.

    Parameters
    ----------
    name : str
        Dataset name, e.g. ``'F-F_Research_Data_Factors'``.

    Returns
    -------
    str
        The decoded contents of the first file inside the zip archive.
        Line endings are the files' native ``\\r\\n`` — _get_data relies
        on that when splitting the payload into chunks.
    """
    url = ''.join([_URL, _URL_PREFIX, name, _URL_SUFFIX])
    with urlopen(url) as socket:
        raw = socket.read()

    # ZipFile needs a seekable file object, so spool the download to a
    # temporary file first.
    with tempfile.TemporaryFile() as tmpf:
        tmpf.write(raw)

        with ZipFile(tmpf, 'r') as zf:
            data = zf.open(zf.namelist()[0]).read().decode()

    return data


def _parse_date_famafrench(x):
x = x.strip()
try: return dt.datetime.strptime(x, '%Y%m')
except: pass
return to_datetime(x)


def _get_data(name):
    """
    Get data for the given name from the Fama/French data library.

    For annual and monthly data, index is a pandas.PeriodIndex, otherwise
    it's a pandas.DatetimeIndex.

    Parameters
    ----------
    name : str
        Dataset name as listed by ``get_available_datasets``.

    Returns
    -------
    dict
        A dictionary of pandas.DataFrame.  Tables are accessed by integer
        keys.  See ``result['DESCR']`` for a description of the dataset.
    """
    params = {'index_col': 0,
              'parse_dates': [0],
              'date_parser': _parse_date_famafrench}

    # headers in these files are not valid: supply column names manually.
    if name.endswith('_Breakpoints'):
        # Files whose name contains '-' hold <=0 / >0 counts, the rest a
        # single Count column, followed by the percentile buckets 0-5,
        # 5-10, ..., 95-100.
        count_cols = ['<=0', '>0'] if name.find('-') > -1 else ['Count']
        pct = list(range(0, 105, 5))
        params['names'] = ['Date'] + count_cols + list(zip(pct, pct[1:]))
        # Prior_2-12_Breakpoints carries two extra preamble lines.
        params['skiprows'] = 1 if name != 'Prior_2-12_Breakpoints' else 3

    doc_chunks, tables = [], []
    data = _download_data_famafrench(name)
    # Blank-line-separated chunks: short ones are prose, long ones tables.
    # NOTE(review): the 800-char threshold is a heuristic — confirm against
    # the smallest tables in the library.
    for chunk in data.split(2 * '\r\n'):
        if len(chunk) < 800:
            doc_chunks.append(chunk.replace('\r\n', ' ').strip())
        else:
            tables.append(chunk)

    datasets, table_desc = {}, []
    for i, src in enumerate(tables):
        # The csv body starts at the first line beginning with a comma
        # (the unnamed index column); anything before it is the title.
        match = re.search(r'^\s*,', src, re.M)
        start = 0 if not match else match.start()

        df = read_csv(StringIO('Date' + src[start:]), **params)
        try:
            # 'M'/'A' period index for monthly/annual data.
            df = df.to_period(df.index.inferred_freq[:1])
        except (AttributeError, TypeError, ValueError):
            # inferred_freq is None (daily/weekly/irregular data) or not a
            # valid period frequency — keep the DatetimeIndex.
            pass
        datasets[i] = df

        title = src[:start].replace('\r\n', ' ').strip()
        shape = '({0} rows x {1} cols)'.format(*df.shape)
        table_desc.append('{0} {1}'.format(title, shape).strip())

    # Assemble the DESCR text: dataset title, prose, then one line per table.
    descr = '{0}\n{1}\n\n'.format(name.replace('_', ' '), len(name) * '-')
    if doc_chunks:
        descr += ' '.join(doc_chunks).replace(2 * ' ', ' ') + '\n\n'

    table_descr = map(lambda x: '{0:3} : {1}'.format(*x),
                      enumerate(table_desc))

    datasets['DESCR'] = descr + '\n'.join(table_descr)
    return datasets
8 changes: 0 additions & 8 deletions pandas_datareader/tests/test_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -461,14 +461,6 @@ def test_read_fred(self):
vix = DataReader("VIXCLS", "fred")
assert isinstance(vix, DataFrame)

def test_read_famafrench(self):
    # Smoke-test a handful of Fama/French datasets: DataReader should
    # return a truthy dict of tables for each name.  Hits the live
    # Dartmouth server, so this requires network access.
    for name in ("F-F_Research_Data_Factors",
                 "F-F_Research_Data_Factors_weekly", "6_Portfolios_2x3",
                 "F-F_ST_Reversal_Factor", "F-F_Momentum_Factor"):
        ff = DataReader(name, "famafrench")
        assert ff
        assert isinstance(ff, dict)
assert isinstance(ff, dict)

def test_not_implemented(self):
    # An unknown data source name must raise NotImplementedError.
    self.assertRaises(NotImplementedError, DataReader, "NA", "NA")

Expand Down
34 changes: 34 additions & 0 deletions pandas_datareader/tests/test_famafrench.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
import nose
import pandas.util.testing as tm

import pandas_datareader.data as web
from pandas_datareader.famafrench import get_available_datasets


class TestFamaFrench(tm.TestCase):
    """Integration tests for the Fama/French reader (require network)."""

    def test_get_data(self):
        # Each dataset must come back as a dict holding a textual
        # description plus at least one table.
        for name in ('F-F_Research_Data_Factors',
                     'F-F_ST_Reversal_Factor',
                     '6_Portfolios_2x3',
                     'Portfolios_Formed_on_ME',
                     'Prior_2-12_Breakpoints',
                     'ME_Breakpoints'):
            result = web.DataReader(name, 'famafrench')
            assert 'DESCR' in result
            assert len(result) > 1

    def test_get_available_datasets(self):
        # The library page currently links a few hundred datasets.
        available = get_available_datasets()
        assert len(available) > 100

    def test_index(self):
        # Table 0 is monthly, table 1 annual with December year-end.
        data = web.DataReader('F-F_Research_Data_Factors', 'famafrench')
        assert data[0].index.freq == 'M'
        assert data[1].index.freq == 'A-DEC'



if __name__ == '__main__':
    # Run this module's tests directly under nose; -x stops at the first
    # failure and --pdb/--pdb-failure drop into the debugger on errors.
    nose.runmodule(argv=[__file__, '-vvs', '-x', '--pdb', '--pdb-failure'],
                   exit=False)