CLN: Split into subpackages #69

Merged
1,252 changes: 20 additions & 1,232 deletions pandas_datareader/data.py

Large diffs are not rendered by default.

43 changes: 43 additions & 0 deletions pandas_datareader/famafrench.py
@@ -0,0 +1,43 @@
import tempfile
import numpy as np
from pandas.io.common import urlopen, ZipFile
from pandas.compat import lmap
from pandas import DataFrame

_URL = 'http://mba.tuck.dartmouth.edu/pages/faculty/ken.french/ftp'


def _get_data(name):
# path of zip files
zip_file_path = '{0}/{1}_TXT.zip'.format(_URL, name)

with urlopen(zip_file_path) as url:
raw = url.read()

with tempfile.TemporaryFile() as tmpf:
tmpf.write(raw)

with ZipFile(tmpf, 'r') as zf:
data = zf.open(zf.namelist()[0]).readlines()

    # lines of length 2 are bare '\r\n' separators between sub-tables
    line_lengths = np.array(lmap(len, data))
    file_edges = np.where(line_lengths == 2)[0]

datasets = {}
edges = zip(file_edges + 1, file_edges[1:])
for i, (left_edge, right_edge) in enumerate(edges):
dataset = [d.split() for d in data[left_edge:right_edge]]
if len(dataset) > 10:
ncol_raw = np.array(lmap(len, dataset))
ncol = np.median(ncol_raw)
header_index = np.where(ncol_raw == ncol - 1)[0][-1]
header = dataset[header_index]
ds_header = dataset[header_index + 1:]
# to ensure the header is unique
header = ['{0} {1}'.format(j, hj) for j, hj in enumerate(header,
start=1)]
index = np.array([d[0] for d in ds_header], dtype=int)
dataset = np.array([d[1:] for d in ds_header], dtype=float)
datasets[i] = DataFrame(dataset, index, columns=header)

return datasets
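
For orientation, a minimal usage sketch (not part of this diff) of how the new module would be called; '5_Industry_Portfolios' is assumed to be one of the dataset names served by the Ken French data library:

# Hedged usage sketch, not part of this PR; requires network access and
# assumes the server serves '<name>_TXT.zip' archives as above.
from pandas_datareader import famafrench

# _get_data returns a dict mapping table number -> DataFrame, one entry
# per sub-table found in the zipped text file.
datasets = famafrench._get_data('5_Industry_Portfolios')
for key, frame in datasets.items():
    print(key, frame.shape)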
46 changes: 46 additions & 0 deletions pandas_datareader/fred.py
@@ -0,0 +1,46 @@
import datetime as dt
from pandas.core.common import is_list_like
from pandas.io.common import urlopen
from pandas import concat, read_csv

from pandas_datareader.utils import _sanitize_dates

_URL = "http://research.stlouisfed.org/fred2/series/"


def _get_data(name, start=dt.datetime(2010, 1, 1),
end=dt.datetime.today()):
"""
    Get data for the given name from the St. Louis Fed (FRED).
    Dates may be datetime objects or anything pandas can parse as a date.

Returns a DataFrame.

    If multiple names are passed, the index of the resulting
    DataFrame is the outer join of the indices of each series.
"""
start, end = _sanitize_dates(start, end)

if not is_list_like(name):
names = [name]
else:
names = name

urls = [_URL + '%s' % n + '/downloaddata/%s' % n + '.csv' for
n in names]

def fetch_data(url, name):
with urlopen(url) as resp:
data = read_csv(resp, index_col=0, parse_dates=True,
header=None, skiprows=1, names=["DATE", name],
na_values='.')
try:
return data.truncate(start, end)
except KeyError:
if data.ix[3].name[7:12] == 'Error':
raise IOError("Failed to get the data. Check that {0!r} is "
"a valid FRED series.".format(name))
raise
df = concat([fetch_data(url, n) for url, n in zip(urls, names)],
axis=1, join='outer')
return df
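
A minimal usage sketch (not part of this diff); 'GDP' and 'CPIAUCSL' are real FRED series identifiers, and passing a list exercises the outer join described in the docstring:

# Hedged usage sketch, not part of this PR; requires network access.
import datetime as dt
from pandas_datareader import fred

# With multiple names, the result's index is the outer join of the
# individual series' indices.
df = fred._get_data(['GDP', 'CPIAUCSL'],
                    start=dt.datetime(2010, 1, 1),
                    end=dt.datetime(2013, 1, 1))
print(df.head())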
Empty file.
57 changes: 57 additions & 0 deletions pandas_datareader/google/daily.py
@@ -0,0 +1,57 @@
from pandas.io.common import urlencode
from pandas_datareader.utils import _retry_read_url
from pandas_datareader.utils import _sanitize_dates
from pandas_datareader.utils import _get_data_from

_URL = 'http://www.google.com/finance/historical?'


def _get_data(symbols=None, start=None, end=None, retry_count=3,
pause=0.001, chunksize=25):
"""
Returns DataFrame/Panel of historical stock prices from symbols, over date
range, start to end. To avoid being penalized by Google Finance servers,
pauses between downloading 'chunks' of symbols can be specified.

Parameters
----------
symbols : string, array-like object (list, tuple, Series), or DataFrame
Single stock symbol (ticker), array-like object of symbols or
DataFrame with index containing stock symbols.
start : string, (defaults to '1/1/2010')
        Starting date, timestamp. Parses many different kinds of date
representations (e.g., 'JAN-01-2010', '1/1/10', 'Jan, 1, 1980')
end : string, (defaults to today)
Ending date, timestamp. Same format as starting date.
retry_count : int, default 3
Number of times to retry query request.
    pause : float, default 0.001
Time, in seconds, to pause between consecutive queries of chunks. If
single value given for symbol, represents the pause between retries.
chunksize : int, default 25
        Number of symbols to download consecutively before initiating pause.

Returns
-------
hist_data : DataFrame (str) or Panel (array-like object, DataFrame)
"""
return _get_data_from(symbols, start, end, None, retry_count, pause,
chunksize, _get_data_one)


def _get_data_one(sym, start, end, interval, retry_count, pause):
"""
Get historical data for the given name from google.
Date format is datetime

Returns a DataFrame.
"""
start, end = _sanitize_dates(start, end)

# www.google.com/finance/historical?q=GOOG&startdate=Jun+9%2C+2011&enddate=Jun+8%2C+2013&output=csv
url = "%s%s" % (_URL,
urlencode({"q": sym,
"startdate": start.strftime('%b %d, ' '%Y'),
"enddate": end.strftime('%b %d, %Y'),
"output": "csv"}))
return _retry_read_url(url, retry_count, pause, 'Google')
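
To make the query format concrete, a small sketch (not part of this diff) that rebuilds the example URL from the comment above:

# Illustrative only: reconstructs the sample URL in the comment above.
from pandas.io.common import urlencode

params = urlencode({"q": "GOOG",
                    "startdate": "Jun 09, 2011",
                    "enddate": "Jun 08, 2013",
                    "output": "csv"})
url = 'http://www.google.com/finance/historical?' + params
# url == 'http://www.google.com/finance/historical?q=GOOG'
#        '&startdate=Jun+09%2C+2011&enddate=Jun+08%2C+2013&output=csv'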
10 changes: 10 additions & 0 deletions pandas_datareader/google/quotes.py
@@ -0,0 +1,10 @@
def _get_data(symbols):
"""
    Get current Google quote.

    (Should) return a DataFrame.

    ToDo: not implemented; Google Finance does not expose this data.
"""
msg = "Google Finance doesn't have this functionality - can't get quote for %s" % symbols
raise NotImplementedError(msg)
5 changes: 3 additions & 2 deletions pandas_datareader/tests/test_data.py
@@ -24,8 +24,9 @@
from urllib2 import HTTPError

import pandas_datareader.data as web
-from pandas_datareader.data import (
-    DataReader, SymbolWarning, RemoteDataError, _yahoo_codes)
+from pandas_datareader.data import DataReader
+from pandas_datareader.utils import SymbolWarning, RemoteDataError
+from pandas_datareader.yahoo.quotes import _yahoo_codes

def _skip_if_no_lxml():
try:
119 changes: 119 additions & 0 deletions pandas_datareader/utils.py
@@ -0,0 +1,119 @@
import time
import warnings
import numpy as np
import datetime as dt

from pandas import to_datetime
import pandas.compat as compat
from pandas.core.common import PandasError
from pandas import Panel, DataFrame
from pandas.io.common import urlopen
from pandas import read_csv
from pandas.compat import StringIO, bytes_to_str
from pandas.util.testing import _network_error_classes


class SymbolWarning(UserWarning):
pass

class RemoteDataError(PandasError, IOError):
pass

def _get_data_from(symbols, start, end, interval, retry_count, pause,
chunksize, src_fn):

# If a single symbol, (e.g., 'GOOG')
if isinstance(symbols, (compat.string_types, int)):
hist_data = src_fn(symbols, start, end, interval, retry_count, pause)
# Or multiple symbols, (e.g., ['GOOG', 'AAPL', 'MSFT'])
elif isinstance(symbols, DataFrame):
hist_data = _dl_mult_symbols(symbols.index, start, end, interval, chunksize,
retry_count, pause, src_fn)
else:
hist_data = _dl_mult_symbols(symbols, start, end, interval, chunksize,
retry_count, pause, src_fn)
return hist_data

def _dl_mult_symbols(symbols, start, end, interval, chunksize, retry_count, pause,
method):
stocks = {}
failed = []
passed = []
for sym_group in _in_chunks(symbols, chunksize):
for sym in sym_group:
try:
stocks[sym] = method(sym, start, end, interval, retry_count, pause)
passed.append(sym)
except IOError:
warnings.warn('Failed to read symbol: {0!r}, replacing with '
'NaN.'.format(sym), SymbolWarning)
failed.append(sym)

if len(passed) == 0:
raise RemoteDataError("No data fetched using "
"{0!r}".format(method.__name__))
try:
if len(stocks) > 0 and len(failed) > 0 and len(passed) > 0:
df_na = stocks[passed[0]].copy()
df_na[:] = np.nan
for sym in failed:
stocks[sym] = df_na
return Panel(stocks).swapaxes('items', 'minor')
except AttributeError:
# cannot construct a panel with just 1D nans indicating no data
raise RemoteDataError("No data fetched using "
"{0!r}".format(method.__name__))


def _sanitize_dates(start, end):
"""
Return (datetime_start, datetime_end) tuple
if start is None - default is 2010/01/01
if end is None - default is today
"""
start = to_datetime(start)
end = to_datetime(end)
if start is None:
start = dt.datetime(2010, 1, 1)
if end is None:
end = dt.datetime.today()
return start, end

def _in_chunks(seq, size):
"""
Return sequence in 'chunks' of size defined by size
"""
return (seq[pos:pos + size] for pos in range(0, len(seq), size))

def _retry_read_url(url, retry_count, pause, name):
"""
Open url (and retry)
"""
for _ in range(retry_count):
time.sleep(pause)

# kludge to close the socket ASAP
try:
with urlopen(url) as resp:
lines = resp.read()
except _network_error_classes:
pass
else:
rs = read_csv(StringIO(bytes_to_str(lines)), index_col=0,
parse_dates=True, na_values='-')[::-1]
# Yahoo! Finance sometimes does this awesome thing where they
# return 2 rows for the most recent business day
if len(rs) > 2 and rs.index[-1] == rs.index[-2]: # pragma: no cover
rs = rs[:-1]

            # Get rid of unicode characters in index name.
try:
rs.index.name = rs.index.name.decode('unicode_escape').encode('ascii', 'ignore')
except AttributeError:
                # Python 3 string has no decode method.
rs.index.name = rs.index.name.encode('ascii', 'ignore').decode()

return rs

raise IOError("after %d tries, %s did not "
"return a 200 for url %r" % (retry_count, name, url))
Empty file.
83 changes: 83 additions & 0 deletions pandas_datareader/yahoo/actions.py
@@ -0,0 +1,83 @@
import time
import csv
from pandas import to_datetime, DataFrame
from pandas.io.common import urlopen
from pandas.util.testing import _network_error_classes
from pandas.compat import StringIO, bytes_to_str

from pandas_datareader.utils import _sanitize_dates

_URL = 'http://ichart.finance.yahoo.com/x?'


def _get_data(symbol, start=None, end=None, retry_count=3, pause=0.001):
"""
    Returns DataFrame of historical corporate actions (dividends and stock
    splits) for a symbol, over date range, start to end. All dates in the
resulting DataFrame correspond with dividend and stock split ex-dates.

Parameters
----------
    symbol : string
        Single stock symbol (ticker).
start : string, (defaults to '1/1/2010')
        Starting date, timestamp. Parses many different kinds of date
representations (e.g., 'JAN-01-2010', '1/1/10', 'Jan, 1, 1980')
end : string, (defaults to today)
Ending date, timestamp. Same format as starting date.
retry_count : int, default 3
Number of times to retry query request.
    pause : float, default 0.001
Time, in seconds, of the pause between retries.
"""

start, end = _sanitize_dates(start, end)
url = (_URL + 's=%s' % symbol + \
'&a=%s' % (start.month - 1) + \
'&b=%s' % start.day + \
'&c=%s' % start.year + \
'&d=%s' % (end.month - 1) + \
'&e=%s' % end.day + \
'&f=%s' % end.year + \
'&g=v')

for _ in range(retry_count):
time.sleep(pause)

try:
with urlopen(url) as resp:
lines = resp.read()
except _network_error_classes:
pass
else:
actions_index = []
actions_entries = []

for line in csv.reader(StringIO(bytes_to_str(lines))):
# Ignore lines that aren't dividends or splits (Yahoo
            # adds a bunch of irrelevant fields.)
if len(line) != 3 or line[0] not in ('DIVIDEND', 'SPLIT'):
continue

action, date, value = line
if action == 'DIVIDEND':
actions_index.append(to_datetime(date))
actions_entries.append({
'action': action,
'value': float(value)
})
elif action == 'SPLIT' and ':' in value:
# Convert the split ratio to a fraction. For example a
# 4:1 split expressed as a fraction is 1/4 = 0.25.
denominator, numerator = value.split(':', 1)
split_fraction = float(numerator) / float(denominator)

actions_index.append(to_datetime(date))
actions_entries.append({
'action': action,
'value': split_fraction
})

return DataFrame(actions_entries, index=actions_index)

raise IOError("after %d tries, Yahoo! did not " \
"return a 200 for url %r" % (retry_count, url))