CLN: Split into subpackages #69

Merged
1,252 changes: 20 additions & 1,232 deletions pandas_datareader/data.py

Large diffs are not rendered by default.

43 changes: 43 additions & 0 deletions pandas_datareader/famafrench.py
@@ -0,0 +1,43 @@
import tempfile
import numpy as np
from pandas.io.common import urlopen, ZipFile
from pandas.compat import lmap
from pandas import DataFrame

_URL = 'http://mba.tuck.dartmouth.edu/pages/faculty/ken.french/ftp'


def _get_data(name):
# path of zip files
zip_file_path = '{0}/{1}_TXT.zip'.format(_URL, name)

with urlopen(zip_file_path) as url:
raw = url.read()

with tempfile.TemporaryFile() as tmpf:
tmpf.write(raw)

with ZipFile(tmpf, 'r') as zf:
data = zf.open(zf.namelist()[0]).readlines()

    # lines of length 2 are bare '\r\n' separators between sub-tables
    line_lengths = np.array(lmap(len, data))
    file_edges = np.where(line_lengths == 2)[0]

datasets = {}
edges = zip(file_edges + 1, file_edges[1:])
for i, (left_edge, right_edge) in enumerate(edges):
dataset = [d.split() for d in data[left_edge:right_edge]]
if len(dataset) > 10:
ncol_raw = np.array(lmap(len, dataset))
ncol = np.median(ncol_raw)
header_index = np.where(ncol_raw == ncol - 1)[0][-1]
header = dataset[header_index]
ds_header = dataset[header_index + 1:]
# to ensure the header is unique
header = ['{0} {1}'.format(j, hj) for j, hj in enumerate(header,
start=1)]
index = np.array([d[0] for d in ds_header], dtype=int)
dataset = np.array([d[1:] for d in ds_header], dtype=float)
datasets[i] = DataFrame(dataset, index, columns=header)

return datasets
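
For orientation, a minimal usage sketch (not part of this diff) of how the new module would be called; '5_Industry_Portfolios' is assumed to be one of the dataset names served by the Ken French data library:

# Hedged usage sketch, not part of this PR; requires network access and
# assumes the server serves '<name>_TXT.zip' archives as above.
from pandas_datareader import famafrench

# _get_data returns a dict mapping table number -> DataFrame, one entry
# per sub-table found in the zipped text file.
datasets = famafrench._get_data('5_Industry_Portfolios')
for key, frame in datasets.items():
    print(key, frame.shape)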
46 changes: 46 additions & 0 deletions pandas_datareader/fred.py
@@ -0,0 +1,46 @@
import datetime as dt
from pandas.core.common import is_list_like
from pandas.io.common import urlopen
from pandas import concat, read_csv

from pandas_datareader.utils import _sanitize_dates

_URL = "http://research.stlouisfed.org/fred2/series/"


def _get_data(name, start=dt.datetime(2010, 1, 1),
end=dt.datetime.today()):
"""
    Get data for the given name from the St. Louis Fed (FRED).
    Dates may be datetime objects or anything pandas can parse as a date.

Returns a DataFrame.

    If multiple names are passed, the index of the resulting
    DataFrame is the outer join of the indices of each series.
"""
start, end = _sanitize_dates(start, end)

if not is_list_like(name):
names = [name]
else:
names = name

urls = [_URL + '%s' % n + '/downloaddata/%s' % n + '.csv' for
n in names]

def fetch_data(url, name):
with urlopen(url) as resp:
data = read_csv(resp, index_col=0, parse_dates=True,
header=None, skiprows=1, names=["DATE", name],
na_values='.')
try:
return data.truncate(start, end)
except KeyError:
if data.ix[3].name[7:12] == 'Error':
raise IOError("Failed to get the data. Check that {0!r} is "
"a valid FRED series.".format(name))
raise
df = concat([fetch_data(url, n) for url, n in zip(urls, names)],
axis=1, join='outer')
return df
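
A minimal usage sketch (not part of this diff); 'GDP' and 'CPIAUCSL' are real FRED series identifiers, and passing a list exercises the outer join described in the docstring:

# Hedged usage sketch, not part of this PR; requires network access.
import datetime as dt
from pandas_datareader import fred

# With multiple names, the result's index is the outer join of the
# individual series' indices.
df = fred._get_data(['GDP', 'CPIAUCSL'],
                    start=dt.datetime(2010, 1, 1),
                    end=dt.datetime(2013, 1, 1))
print(df.head())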
Empty file.
57 changes: 57 additions & 0 deletions pandas_datareader/google/daily.py
@@ -0,0 +1,57 @@
from pandas.io.common import urlencode
from pandas_datareader.utils import _retry_read_url
from pandas_datareader.utils import _sanitize_dates
from pandas_datareader.utils import _get_data_from

_URL = 'http://www.google.com/finance/historical?'


def _get_data(symbols=None, start=None, end=None, retry_count=3,
pause=0.001, chunksize=25):
"""
Returns DataFrame/Panel of historical stock prices from symbols, over date
range, start to end. To avoid being penalized by Google Finance servers,
pauses between downloading 'chunks' of symbols can be specified.

Parameters
----------
symbols : string, array-like object (list, tuple, Series), or DataFrame
Single stock symbol (ticker), array-like object of symbols or
DataFrame with index containing stock symbols.
start : string, (defaults to '1/1/2010')
        Starting date, timestamp. Parses many different kinds of date
representations (e.g., 'JAN-01-2010', '1/1/10', 'Jan, 1, 1980')
end : string, (defaults to today)
Ending date, timestamp. Same format as starting date.
retry_count : int, default 3
Number of times to retry query request.
    pause : float, default 0.001
Time, in seconds, to pause between consecutive queries of chunks. If
single value given for symbol, represents the pause between retries.
chunksize : int, default 25
        Number of symbols to download consecutively before initiating pause.

Returns
-------
hist_data : DataFrame (str) or Panel (array-like object, DataFrame)
"""
return _get_data_from(symbols, start, end, None, retry_count, pause,
chunksize, _get_data_one)


def _get_data_one(sym, start, end, interval, retry_count, pause):
"""
Get historical data for the given name from google.
Date format is datetime

Returns a DataFrame.
"""
start, end = _sanitize_dates(start, end)

# www.google.com/finance/historical?q=GOOG&startdate=Jun+9%2C+2011&enddate=Jun+8%2C+2013&output=csv
url = "%s%s" % (_URL,
urlencode({"q": sym,
"startdate": start.strftime('%b %d, ' '%Y'),
"enddate": end.strftime('%b %d, %Y'),
"output": "csv"}))
return _retry_read_url(url, retry_count, pause, 'Google')
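
To make the query format concrete, a small sketch (not part of this diff) that rebuilds the example URL from the comment above:

# Illustrative only: reconstructs the sample URL in the comment above.
from pandas.io.common import urlencode

params = urlencode({"q": "GOOG",
                    "startdate": "Jun 09, 2011",
                    "enddate": "Jun 08, 2013",
                    "output": "csv"})
url = 'http://www.google.com/finance/historical?' + params
# url == 'http://www.google.com/finance/historical?q=GOOG'
#        '&startdate=Jun+09%2C+2011&enddate=Jun+08%2C+2013&output=csv'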
10 changes: 10 additions & 0 deletions pandas_datareader/google/quotes.py
@@ -0,0 +1,10 @@
def _get_data(symbols):
"""
    Get current Google quote.

    (Should) return a DataFrame.

    ToDo: not implemented; Google Finance does not expose this data.
"""
msg = "Google Finance doesn't have this functionality - can't get quote for %s" % symbols
raise NotImplementedError(msg)
5 changes: 3 additions & 2 deletions pandas_datareader/tests/test_data.py
@@ -24,8 +24,9 @@
from urllib2 import HTTPError

import pandas_datareader.data as web
-from pandas_datareader.data import (
-    DataReader, SymbolWarning, RemoteDataError, _yahoo_codes)
+from pandas_datareader.data import DataReader
+from pandas_datareader.utils import SymbolWarning, RemoteDataError
+from pandas_datareader.yahoo.quotes import _yahoo_codes

def _skip_if_no_lxml():
try:
119 changes: 119 additions & 0 deletions pandas_datareader/utils.py
@@ -0,0 +1,119 @@
import time
import warnings
import numpy as np
import datetime as dt

from pandas import to_datetime
import pandas.compat as compat
from pandas.core.common import PandasError
from pandas import Panel, DataFrame
from pandas.io.common import urlopen
from pandas import read_csv
from pandas.compat import StringIO, bytes_to_str
from pandas.util.testing import _network_error_classes


class SymbolWarning(UserWarning):
pass

class RemoteDataError(PandasError, IOError):
pass

def _get_data_from(symbols, start, end, interval, retry_count, pause,
chunksize, src_fn):

# If a single symbol, (e.g., 'GOOG')
if isinstance(symbols, (compat.string_types, int)):
hist_data = src_fn(symbols, start, end, interval, retry_count, pause)
# Or multiple symbols, (e.g., ['GOOG', 'AAPL', 'MSFT'])
elif isinstance(symbols, DataFrame):
hist_data = _dl_mult_symbols(symbols.index, start, end, interval, chunksize,
retry_count, pause, src_fn)
else:
hist_data = _dl_mult_symbols(symbols, start, end, interval, chunksize,
retry_count, pause, src_fn)
return hist_data

def _dl_mult_symbols(symbols, start, end, interval, chunksize, retry_count, pause,
method):
stocks = {}
failed = []
passed = []
for sym_group in _in_chunks(symbols, chunksize):
for sym in sym_group:
try:
stocks[sym] = method(sym, start, end, interval, retry_count, pause)
passed.append(sym)
except IOError:
warnings.warn('Failed to read symbol: {0!r}, replacing with '
'NaN.'.format(sym), SymbolWarning)
failed.append(sym)

if len(passed) == 0:
raise RemoteDataError("No data fetched using "
"{0!r}".format(method.__name__))
try:
if len(stocks) > 0 and len(failed) > 0 and len(passed) > 0:
df_na = stocks[passed[0]].copy()
df_na[:] = np.nan
for sym in failed:
stocks[sym] = df_na
return Panel(stocks).swapaxes('items', 'minor')
except AttributeError:
# cannot construct a panel with just 1D nans indicating no data
raise RemoteDataError("No data fetched using "
"{0!r}".format(method.__name__))


def _sanitize_dates(start, end):
"""
Return (datetime_start, datetime_end) tuple
if start is None - default is 2010/01/01
if end is None - default is today
"""
start = to_datetime(start)
end = to_datetime(end)
if start is None:
start = dt.datetime(2010, 1, 1)
if end is None:
end = dt.datetime.today()
return start, end

def _in_chunks(seq, size):
"""
Return sequence in 'chunks' of size defined by size
"""
return (seq[pos:pos + size] for pos in range(0, len(seq), size))

def _retry_read_url(url, retry_count, pause, name):
"""
Open url (and retry)
"""
for _ in range(retry_count):
time.sleep(pause)

# kludge to close the socket ASAP
try:
with urlopen(url) as resp:
lines = resp.read()
except _network_error_classes:
pass
else:
rs = read_csv(StringIO(bytes_to_str(lines)), index_col=0,
parse_dates=True, na_values='-')[::-1]
# Yahoo! Finance sometimes does this awesome thing where they
# return 2 rows for the most recent business day
if len(rs) > 2 and rs.index[-1] == rs.index[-2]: # pragma: no cover
rs = rs[:-1]

            # Get rid of unicode characters in index name.
try:
rs.index.name = rs.index.name.decode('unicode_escape').encode('ascii', 'ignore')
except AttributeError:
                # Python 3 string has no decode method.
rs.index.name = rs.index.name.encode('ascii', 'ignore').decode()

return rs

raise IOError("after %d tries, %s did not "
"return a 200 for url %r" % (retry_count, name, url))
Empty file.
83 changes: 83 additions & 0 deletions pandas_datareader/yahoo/actions.py
@@ -0,0 +1,83 @@
import time
import csv
from pandas import to_datetime, DataFrame
from pandas.io.common import urlopen
from pandas.util.testing import _network_error_classes
from pandas.compat import StringIO, bytes_to_str

from pandas_datareader.utils import _sanitize_dates

_URL = 'http://ichart.finance.yahoo.com/x?'


def _get_data(symbol, start=None, end=None, retry_count=3, pause=0.001):
"""
    Returns DataFrame of historical corporate actions (dividends and stock
    splits) for a symbol, over date range, start to end. All dates in the
resulting DataFrame correspond with dividend and stock split ex-dates.

Parameters
----------
    symbol : string
        Single stock symbol (ticker).
start : string, (defaults to '1/1/2010')
        Starting date, timestamp. Parses many different kinds of date
representations (e.g., 'JAN-01-2010', '1/1/10', 'Jan, 1, 1980')
end : string, (defaults to today)
Ending date, timestamp. Same format as starting date.
retry_count : int, default 3
Number of times to retry query request.
    pause : float, default 0.001
Time, in seconds, of the pause between retries.
"""

start, end = _sanitize_dates(start, end)
url = (_URL + 's=%s' % symbol + \
'&a=%s' % (start.month - 1) + \
'&b=%s' % start.day + \
'&c=%s' % start.year + \
'&d=%s' % (end.month - 1) + \
'&e=%s' % end.day + \
'&f=%s' % end.year + \
'&g=v')

for _ in range(retry_count):
time.sleep(pause)

try:
with urlopen(url) as resp:
lines = resp.read()
except _network_error_classes:
pass
else:
actions_index = []
actions_entries = []

for line in csv.reader(StringIO(bytes_to_str(lines))):
# Ignore lines that aren't dividends or splits (Yahoo
            # adds a bunch of irrelevant fields.)
if len(line) != 3 or line[0] not in ('DIVIDEND', 'SPLIT'):
continue

action, date, value = line
if action == 'DIVIDEND':
actions_index.append(to_datetime(date))
actions_entries.append({
'action': action,
'value': float(value)
})
elif action == 'SPLIT' and ':' in value:
# Convert the split ratio to a fraction. For example a
# 4:1 split expressed as a fraction is 1/4 = 0.25.
denominator, numerator = value.split(':', 1)
split_fraction = float(numerator) / float(denominator)

actions_index.append(to_datetime(date))
actions_entries.append({
'action': action,
'value': split_fraction
})

return DataFrame(actions_entries, index=actions_index)

raise IOError("after %d tries, Yahoo! did not " \
"return a 200 for url %r" % (retry_count, url))