Skip to content

Commit 3e79a08

Browse files
b4thesunriseDefangCui
andauthored
Add Crypto dataset from coingecko (#733)
* add crypto symbols collectors * add crypto data collector * add crypto symbols collectors * add crypto data collector * solver region and source problem * fix merge * fix merge * clean all cn information Co-authored-by: DefangCui <170007807@pku.edu.cn>
1 parent dfc0ed3 commit 3e79a08

File tree

4 files changed

+373
-0
lines changed

4 files changed

+373
-0
lines changed
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
# Collect Crypto Data
2+
3+
> *Please pay **ATTENTION** that the data is collected from [Coingecko](https://www.coingecko.com/en/api) and the data might not be perfect. We recommend that users prepare their own data if they have a high-quality dataset. For more information, users can refer to the [related document](https://qlib.readthedocs.io/en/latest/component/data.html#converting-csv-format-into-qlib-format)*
4+
5+
## Requirements
6+
7+
```bash
8+
pip install -r requirements.txt
9+
```
10+
11+
## Usage of the dataset
12+
> *The crypto dataset only supports data retrieval; backtesting is not supported due to the lack of OHLC data.*
13+
14+
## Collect Data
15+
16+
17+
### Crypto Data
18+
19+
#### 1d from Coingecko
20+
21+
```bash
22+
23+
# download from https://api.coingecko.com/api/v3/
24+
python collector.py download_data --source_dir ~/.qlib/crypto_data/source/1d --start 2015-01-01 --end 2021-11-30 --delay 1 --interval 1d
25+
26+
# normalize
27+
python collector.py normalize_data --source_dir ~/.qlib/crypto_data/source/1d --normalize_dir ~/.qlib/crypto_data/source/1d_nor --interval 1d --date_field_name date
28+
29+
# dump data
30+
cd qlib/scripts
31+
python dump_bin.py dump_all --csv_path ~/.qlib/crypto_data/source/1d_nor --qlib_dir ~/.qlib/qlib_data/crypto_data --freq day --date_field_name date --include_fields prices,total_volumes,market_caps
32+
33+
```
34+
35+
### using data
36+
37+
```python
38+
import qlib
39+
from qlib.data import D
40+
41+
qlib.init(provider_uri="~/.qlib/qlib_data/crypto_data")
42+
df = D.features(D.instruments(market="all"), ["$prices", "$total_volumes","$market_caps"], freq="day")
43+
```
44+
45+
46+
### Help
47+
```bash
48+
python collector.py download_data --help
49+
```
50+
51+
## Parameters
52+
53+
- interval: 1d
54+
- delay: 1
Lines changed: 278 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,278 @@
1+
import abc
2+
import sys
3+
import datetime
4+
from abc import ABC
5+
from pathlib import Path
6+
7+
import fire
8+
import requests
9+
import pandas as pd
10+
from loguru import logger
11+
from dateutil.tz import tzlocal
12+
13+
CUR_DIR = Path(__file__).resolve().parent
14+
sys.path.append(str(CUR_DIR.parent.parent))
15+
from data_collector.base import BaseCollector, BaseNormalize, BaseRun
16+
from data_collector.utils import get_cg_crypto_symbols
17+
18+
from pycoingecko import CoinGeckoAPI
19+
from time import mktime
20+
from datetime import datetime as dt
21+
import time
22+
23+
24+
class CryptoCollector(BaseCollector):
    """Collector that downloads crypto market history from the CoinGecko API.

    ``_timezone`` is abstract here and has to be supplied by a concrete
    subclass before the collector can be instantiated.
    """

    def __init__(
        self,
        save_dir: [str, Path],
        start=None,
        end=None,
        interval="1d",
        max_workers=1,
        max_collector_count=2,
        delay=1,  # delay need to be one
        check_data_length: int = None,
        limit_nums: int = None,
    ):
        """

        Parameters
        ----------
        save_dir: str
            crypto save dir
        start: str
            start datetime, default None
        end: str
            end datetime, default None
        interval: str
            freq, default 1d; ``get_data`` only supports the base class's 1d interval
        max_workers: int
            workers, default 1
        max_collector_count: int
            default 2
        delay: float
            delay forwarded to the base collector, default 1
            (presumably the number of seconds ``self.sleep()`` waits before each request -- confirm in BaseCollector)
        check_data_length: int
            check data length, if not None and greater than 0, each symbol will be considered complete if its data length is greater than or equal to this value, otherwise it will be fetched again, the maximum number of fetches being (max_collector_count). By default None.
        limit_nums: int
            using for debug, by default None
        """
        super(CryptoCollector, self).__init__(
            save_dir=save_dir,
            start=start,
            end=end,
            interval=interval,
            max_workers=max_workers,
            max_collector_count=max_collector_count,
            delay=delay,
            check_data_length=check_data_length,
            limit_nums=limit_nums,
        )

        self.init_datetime()

    def init_datetime(self):
        """Validate ``self.interval`` and convert the start/end datetimes to local time.

        Raises
        ------
        ValueError
            if ``self.interval`` is neither the 1min nor the 1d interval of the base class.
        """
        if self.interval == self.INTERVAL_1min:
            # Minute data cannot start before the base class's 1min lower bound.
            self.start_datetime = max(self.start_datetime, self.DEFAULT_START_DATETIME_1MIN)
        elif self.interval != self.INTERVAL_1d:
            raise ValueError(f"interval error: {self.interval}")

        self.start_datetime = self.convert_datetime(self.start_datetime, self._timezone)
        self.end_datetime = self.convert_datetime(self.end_datetime, self._timezone)

    @staticmethod
    def convert_datetime(dt: [pd.Timestamp, datetime.date, str], timezone):
        """Interpret ``dt`` in ``timezone`` and re-express it in the machine's local timezone.

        Parameters
        ----------
        dt: pd.Timestamp, datetime.date or str
            the datetime to convert
        timezone:
            timezone in which ``dt`` is interpreted

        Returns
        -------
        the converted ``pd.Timestamp``; if the conversion raises ``ValueError``
        the input is returned unchanged (deliberate best effort).
        """
        try:
            epoch_seconds = pd.Timestamp(dt, tz=timezone).timestamp()
            return pd.Timestamp(epoch_seconds, tz=tzlocal(), unit="s")
        except ValueError:
            # NOTE(review): presumably reached when ``dt`` already carries tz info -- confirm.
            # Return the untouched input rather than a half-converted value.
            return dt

    @property
    @abc.abstractmethod
    def _timezone(self):
        """Timezone used to interpret the start/end datetimes; implemented by subclasses."""
        raise NotImplementedError("rewrite get_timezone")

    @staticmethod
    def get_data_from_remote(symbol, interval, start, end):
        """Download the market chart of ``symbol`` and keep the rows between ``start`` and ``end``.

        Parameters
        ----------
        symbol: str
            CoinGecko coin id
        interval: str
            only used in the warning message
        start, end:
            date bounds; both are exclusive (rows with ``start < date < end`` are kept)

        Returns
        -------
        pd.DataFrame with a ``date`` column followed by one column per key of the
        API response, or None when anything fails (the failure is logged as a warning).
        """
        error_msg = f"{symbol}-{interval}-{start}-{end}"
        try:
            cg = CoinGeckoAPI()
            data = cg.get_coin_market_chart_by_id(id=symbol, vs_currency="usd", days="max")
            # The first item of each entry is divided by 1000 before being read as an
            # epoch timestamp in local time; only the calendar date is kept.
            columns = {"date": [dt.fromtimestamp(int(x[0] / 1000)).date() for x in data["prices"]]}
            for key in data.keys():
                columns[key] = [x[1] for x in data[key]]
            _resp = pd.DataFrame(columns)

            start_date = pd.to_datetime(start).date()
            end_date = pd.to_datetime(end).date()
            # NOTE(review): the start date itself is excluded by the strict comparison -- confirm this is intended.
            _resp = _resp[(_resp["date"] > start_date) & (_resp["date"] < end_date)]
            # drop=True keeps the old positional index out of the returned columns.
            return _resp.reset_index(drop=True)
        except Exception as e:
            logger.warning(f"{error_msg}:{e}")

    def get_data(
        self, symbol: str, interval: str, start_datetime: pd.Timestamp, end_datetime: pd.Timestamp
    ) -> [pd.DataFrame]:
        """Fetch the data of one symbol, calling ``self.sleep()`` before the request.

        Raises
        ------
        ValueError
            if ``interval`` is not the 1d interval.
        """
        if interval != self.INTERVAL_1d:
            raise ValueError(f"cannot support {interval}")

        self.sleep()
        return self.get_data_from_remote(
            symbol,
            interval=interval,
            start=start_datetime,
            end=end_datetime,
        )
137+
138+
139+
class CryptoCollector1d(CryptoCollector, ABC):
    """Concrete CoinGecko collector selected for the ``1d`` interval."""

    def get_instrument_list(self):
        """Return the symbols provided by ``get_cg_crypto_symbols``, logging their count."""
        logger.info("get coingecko crypto symbols......")
        instruments = get_cg_crypto_symbols()
        logger.info(f"get {len(instruments)} symbols.")
        return instruments

    def normalize_symbol(self, symbol):
        """Symbols are used as-is; no renaming is applied."""
        return symbol

    @property
    def _timezone(self):
        """Timezone in which the start/end datetimes are interpreted."""
        return "Asia/Shanghai"
152+
153+
154+
class CryptoNormalize(BaseNormalize):
    """Normalize collected crypto data: date index, de-duplicated and sorted."""

    DAILY_FORMAT = "%Y-%m-%d"

    @staticmethod
    def normalize_crypto(
        df: pd.DataFrame,
        calendar_list: list = None,
        date_field_name: str = "date",
        symbol_field_name: str = "symbol",
    ):
        """Return a cleaned copy of ``df`` ordered by date.

        Parameters
        ----------
        df: pd.DataFrame
            raw data containing a ``date_field_name`` column
        calendar_list: list
            optional calendar; when given, rows are re-indexed onto the calendar
            entries lying between the first and last date of ``df``
        date_field_name: str
            name of the date column, default date
        symbol_field_name: str
            accepted for signature compatibility; not used by this function

        Returns
        -------
        pd.DataFrame with ``date_field_name`` restored as a regular column;
        an empty ``df`` is returned untouched.
        """
        if df.empty:
            return df

        frame = df.copy().set_index(date_field_name)
        frame.index = pd.to_datetime(frame.index)
        # Keep only the first row of every duplicated date.
        frame = frame[~frame.index.duplicated(keep="first")]

        if calendar_list is not None:
            lower = pd.Timestamp(frame.index.min()).date()
            upper = pd.Timestamp(frame.index.max()).date() + pd.Timedelta(hours=23, minutes=59)
            calendar_index = pd.DataFrame(index=calendar_list).loc[lower:upper].index
            frame = frame.reindex(calendar_index)

        frame = frame.sort_index()
        frame.index.names = [date_field_name]
        return frame.reset_index()

    def normalize(self, df: pd.DataFrame) -> pd.DataFrame:
        """Apply ``normalize_crypto`` using this instance's calendar and field names."""
        return self.normalize_crypto(df, self._calendar_list, self._date_field_name, self._symbol_field_name)
187+
188+
189+
class CryptoNormalize1d(CryptoNormalize):
    """Normalizer selected for the ``1d`` interval; it supplies no calendar."""

    def _get_calendar_list(self):
        # NOTE(review): presumably consumed by BaseNormalize to populate ``_calendar_list`` -- confirm.
        return None
192+
193+
194+
class Run(BaseRun):
    """Command-line entry points for downloading and normalizing crypto data."""

    def __init__(self, source_dir=None, normalize_dir=None, max_workers=1, interval="1d"):
        """

        Parameters
        ----------
        source_dir: str
            Directory in which the raw downloaded data is stored, default None
            (the fallback location is chosen by the base class)
        normalize_dir: str
            Directory in which the normalized data is stored, default None
            (the fallback location is chosen by the base class)
        max_workers: int
            Concurrent number, default is 1
        interval: str
            freq, default 1d; it is the suffix of the collector / normalize class names
        """
        super().__init__(source_dir, normalize_dir, max_workers, interval)

    @property
    def collector_class_name(self):
        """Name of the collector class matching ``self.interval``."""
        return f"CryptoCollector{self.interval}"

    @property
    def normalize_class_name(self):
        """Name of the normalize class matching ``self.interval``."""
        return f"CryptoNormalize{self.interval}"

    @property
    def default_base_dir(self) -> [Path, str]:
        """Directory that contains this script."""
        return CUR_DIR

    def download_data(
        self,
        max_collector_count=2,
        delay=0,
        start=None,
        end=None,
        interval="1d",
        check_data_length: int = None,
        limit_nums=None,
    ):
        """download data from Internet

        All arguments are forwarded unchanged to ``BaseRun.download_data``.

        Parameters
        ----------
        max_collector_count: int
            default 2
        delay: float
            delay between requests, default 0
        start: str
            start datetime, default None
        end: str
            end datetime, default None
        interval: str
            freq, default 1d, currently only support 1d
        check_data_length: int
            check data length, if not None and greater than 0, each symbol will be considered complete if its data length is greater than or equal to this value, otherwise it will be fetched again, the maximum number of fetches being (max_collector_count). By default None.
        limit_nums: int
            using for debug, by default None

        Examples
        ---------
            # get daily data
            $ python collector.py download_data --source_dir ~/.qlib/crypto_data/source/1d --start 2015-01-01 --end 2021-11-30 --delay 1 --interval 1d
        """
        super().download_data(max_collector_count, delay, start, end, interval, check_data_length, limit_nums)

    def normalize_data(self, date_field_name: str = "date", symbol_field_name: str = "symbol"):
        """normalize data

        Both arguments are forwarded unchanged to ``BaseRun.normalize_data``.

        Parameters
        ----------
        date_field_name: str
            date field name, default date
        symbol_field_name: str
            symbol field name, default symbol

        Examples
        ---------
            $ python collector.py normalize_data --source_dir ~/.qlib/crypto_data/source/1d --normalize_dir ~/.qlib/crypto_data/source/1d_nor --interval 1d --date_field_name date
        """
        super().normalize_data(date_field_name, symbol_field_name)
275+
276+
277+
if __name__ == "__main__":
    # Hand the Run class to python-fire as the command-line entry point.
    fire.Fire(Run)
Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
loguru
2+
fire
3+
requests
4+
numpy
5+
pandas
6+
tqdm
7+
lxml
8+
pycoingecko

scripts/data_collector/utils.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,7 @@
1919
from tqdm import tqdm
2020
from functools import partial
2121
from concurrent.futures import ProcessPoolExecutor
22+
from pycoingecko import CoinGeckoAPI
2223

2324
HS_SYMBOLS_URL = "http://app.finance.ifeng.com/hq/list.php?type=stock_a&class={s_type}"
2425

@@ -42,6 +43,7 @@
4243
_US_SYMBOLS = None
4344
_IN_SYMBOLS = None
4445
_EN_FUND_SYMBOLS = None
46+
_CG_CRYPTO_SYMBOLS = None
4547
_CALENDAR_MAP = {}
4648

4749
# NOTE: Until 2020-10-20 20:00:00
@@ -377,6 +379,37 @@ def _get_eastmoney():
377379
return _EN_FUND_SYMBOLS
378380

379381

382+
def get_cg_crypto_symbols(qlib_data_path: [str, Path] = None) -> list:
    """get crypto symbols in coingecko

    The result is cached in the module-level ``_CG_CRYPTO_SYMBOLS`` so the
    remote API is queried at most once per process.

    Parameters
    ----------
    qlib_data_path: str or Path
        not used by this function; kept for signature compatibility

    Returns
    -------
        sorted, de-duplicated list of the coin ids returned by a single
        ``get_coins_markets(vs_currency="usd")`` call
        (NOTE(review): presumably only one page of results -- confirm against the CoinGecko API)

    Raises
    ------
    ValueError
        if the request to CoinGecko fails; the original error is attached as the cause.
    """
    global _CG_CRYPTO_SYMBOLS

    @deco_retry
    def _get_coingecko():
        try:
            cg = CoinGeckoAPI()
            resp = pd.DataFrame(cg.get_coins_markets(vs_currency="usd"))
        except Exception as e:
            # ``Exception`` (not a bare except) lets KeyboardInterrupt / SystemExit
            # propagate; chaining keeps the real failure visible.
            raise ValueError("request error") from e
        try:
            _symbols = resp["id"].to_list()
        except Exception as e:
            logger.warning(f"request error: {e}")
            raise
        return _symbols

    if _CG_CRYPTO_SYMBOLS is None:
        _CG_CRYPTO_SYMBOLS = sorted(set(_get_coingecko()))

    return _CG_CRYPTO_SYMBOLS
411+
412+
380413
def symbol_suffix_to_prefix(symbol: str, capital: bool = True) -> str:
381414
"""symbol suffix to prefix
382415

0 commit comments

Comments
 (0)