Merge commit df406d5

Derek-Wds committed Nov 19, 2020
2 parents 21c0dae + 4a0394c
Showing 27 changed files with 1,856 additions and 495 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/test.yml
@@ -52,7 +52,7 @@ jobs:
- name: Test data downloads and examples
run: |
-python scripts/get_data.py qlib_data_cn --target_dir ~/.qlib/qlib_data/cn_data
+python scripts/get_data.py qlib_data --target_dir ~/.qlib/qlib_data/cn_data --interval 1d --region cn
# cd examples
# estimator -c estimator/estimator_config.yaml
# jupyter nbconvert --execute estimator/analyze_from_estimator.ipynb --to html
2 changes: 1 addition & 1 deletion README.md
@@ -91,7 +91,7 @@ Also, users can install ``Qlib`` by the source code according to the following s
## Data Preparation
Load and prepare data by running the following code:
```bash
-python scripts/get_data.py qlib_data_cn --target_dir ~/.qlib/qlib_data/cn_data
+python scripts/get_data.py qlib_data --target_dir ~/.qlib/qlib_data/cn_data --region cn
```

This dataset is created by public data collected by [crawler scripts](scripts/data_collector/), which have been released in
4 changes: 2 additions & 2 deletions docs/component/data.rst
@@ -34,7 +34,7 @@ Qlib Format Dataset

.. code-block:: bash
-python scripts/get_data.py qlib_data_cn --target_dir ~/.qlib/qlib_data/cn_data
+python scripts/get_data.py qlib_data --target_dir ~/.qlib/qlib_data/cn_data --region cn
After running the above command, users can find China stock data in Qlib format in the ``~/.qlib/qlib_data/cn_data`` directory.

@@ -59,7 +59,7 @@ Suppose that users have prepared their CSV format data in the directory ``~/.qlib/csv

.. code-block:: bash
-python scripts/dump_bin.py dump --csv_path ~/.qlib/csv_data/my_data --qlib_dir ~/.qlib/qlib_data/my_data --include_fields open,close,high,low,volume,factor
+python scripts/dump_bin.py dump_all --csv_path ~/.qlib/csv_data/my_data --qlib_dir ~/.qlib/qlib_data/my_data --include_fields open,close,high,low,volume,factor
After conversion, users can find their Qlib format data in the directory `~/.qlib/qlib_data/my_data`.
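The `dump_all` command above reads one CSV file per instrument, taking the instrument code from the filename and the field columns from `--include_fields`. A hedged sketch of the expected layout (column names are taken from the command above; the sample row, path, and instrument code are invented for illustration):

```python
# Sketch of the CSV layout dump_all expects, inferred from the flags above
# (not taken from dump_bin.py itself): one file per instrument, with the
# filename as the instrument code and columns = date + --include_fields.
import csv
import pathlib
import tempfile

csv_dir = pathlib.Path(tempfile.mkdtemp()) / "my_data"
csv_dir.mkdir()
fields = ["date", "open", "close", "high", "low", "volume", "factor"]
with open(csv_dir / "SH600000.csv", "w", newline="") as f:
    writer = csv.writer(f)
    writer.writerow(fields)
    writer.writerow(["2020-01-02", 12.0, 12.3, 12.5, 11.9, 1000000, 1.0])

print(sorted(p.name for p in csv_dir.iterdir()))  # ['SH600000.csv']
```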

2 changes: 1 addition & 1 deletion docs/introduction/quick.rst
@@ -40,7 +40,7 @@ Load and prepare data by running the following code:

.. code-block::
-python scripts/get_data.py qlib_data_cn --target_dir ~/.qlib/qlib_data/cn_data
+python scripts/get_data.py qlib_data --target_dir ~/.qlib/qlib_data/cn_data --region cn
This dataset is created from public data collected by the crawler scripts in ``scripts/data_collector/``, which have been released in the same repository. Users can recreate the dataset with those scripts.

2 changes: 1 addition & 1 deletion docs/start/initialization.rst
@@ -14,7 +14,7 @@ Please follow the steps below to initialize ``Qlib``.
- Download and prepare the data: execute the following command to download stock data. Please note that the data is collected from `Yahoo Finance <https://finance.yahoo.com/lookup>`_ and might not be perfect. We recommend that users with high-quality datasets prepare their own data. Please refer to `Data <../component/data.html#converting-csv-format-into-qlib-format>`_ for more information about customized datasets.
.. code-block:: bash
-python scripts/get_data.py qlib_data_cn --target_dir ~/.qlib/qlib_data/cn_data
+python scripts/get_data.py qlib_data --target_dir ~/.qlib/qlib_data/cn_data --region cn
Please refer to `Data Preparation <../component/data.html#data-preparation>`_ for more information about `get_data.py`.


47 changes: 34 additions & 13 deletions qlib/data/dataset/handler.py
@@ -56,7 +56,24 @@ def __init__(
        end_time=None,
        data_loader: Tuple[dict, str, DataLoader] = None,
        init_data=True,
+        fetch_orig=True,
    ):
+        """
+        Parameters
+        ----------
+        instruments :
+            The stock list to retrieve
+        start_time :
+            start_time of the original data
+        end_time :
+            end_time of the original data
+        data_loader : Tuple[dict, str, DataLoader]
+            data loader to load the data
+        init_data :
+            initialize the original data in the constructor
+        fetch_orig : bool
+            Return the original data instead of a copy, if possible
+        """
        # Set logger
        self.logger = get_module_logger("DataHandler")

@@ -72,6 +89,7 @@ def __init__(
        self.instruments = instruments
        self.start_time = start_time
        self.end_time = end_time
+        self.fetch_orig = fetch_orig
        if init_data:
            with TimeInspector.logt("Init data"):
                self.init()
@@ -138,7 +156,7 @@ def fetch(
        -------
        pd.DataFrame:
        """
-        df = fetch_df_by_index(self._data, selector, level)
+        df = fetch_df_by_index(self._data, selector, level, fetch_orig=self.fetch_orig)
        df = self._fetch_df_by_col(df, col_set)
        if squeeze:
            # squeeze columns
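The `squeeze` branch above collapses single-entry axes into a Series or scalar; a minimal pandas sketch of the idea (not the qlib implementation):

```python
import pandas as pd

# A one-column frame squeezed along the column axis becomes a Series;
# the row labels are preserved, only the redundant axis is dropped.
df = pd.DataFrame({"close": [1.0, 2.0]}, index=["2020-01-02", "2020-01-03"])
series = df.squeeze(axis=1)
print(type(series).__name__)  # Series
```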
@@ -269,8 +287,10 @@ def __init__(
        for pname in "infer_processors", "learn_processors":
            for proc in locals()[pname]:
                getattr(self, pname).append(
-                    init_instance_by_config(proc, processor_module, accept_types=(processor_module.Processor,))
-                )
+                    init_instance_by_config(
+                        proc,
+                        None if (isinstance(data_loader, dict) and "module_path" in data_loader) else data_loader_module,
+                        accept_types=processor_module.Processor))

        self.process_type = process_type
        super().__init__(instruments, start_time, end_time, data_loader, **kwargs)
@@ -354,15 +374,16 @@ def init(self, init_type: str = IT_FIT_SEQ, enable_cache: bool = False):
        # init raw data
        super().init(enable_cache=enable_cache)

-        if init_type == DataHandlerLP.IT_FIT_IND:
-            self.fit()
-            self.process_data()
-        elif init_type == DataHandlerLP.IT_LS:
-            self.process_data()
-        elif init_type == DataHandlerLP.IT_FIT_SEQ:
-            self.fit_process_data()
-        else:
-            raise NotImplementedError(f"This type of input is not supported")
+        with TimeInspector.logt("fit & process data"):
+            if init_type == DataHandlerLP.IT_FIT_IND:
+                self.fit()
+                self.process_data()
+            elif init_type == DataHandlerLP.IT_LS:
+                self.process_data()
+            elif init_type == DataHandlerLP.IT_FIT_SEQ:
+                self.fit_process_data()
+            else:
+                raise NotImplementedError(f"This type of input is not supported")

        # TODO: Be able to cache handler data. Save the memory for data processing
@@ -396,7 +417,7 @@ def fetch(
        pd.DataFrame:
        """
        df = self._get_df_by_key(data_key)
-        df = fetch_df_by_index(df, selector, level)
+        df = fetch_df_by_index(df, selector, level, fetch_orig=self.fetch_orig)
        return self._fetch_df_by_col(df, col_set)

    def get_cols(self, col_set=DataHandler.CS_ALL, data_key: str = DK_I) -> list:
13 changes: 9 additions & 4 deletions qlib/data/dataset/utils.py
@@ -32,7 +32,7 @@ def get_level_index(df: pd.DataFrame, level=Union[str, int]) -> int:


def fetch_df_by_index(
-    df: pd.DataFrame, selector: Union[pd.Timestamp, slice, str, list], level: Union[str, int]
+    df: pd.DataFrame, selector: Union[pd.Timestamp, slice, str, list], level: Union[str, int], fetch_orig=True,
) -> pd.DataFrame:
    """
    fetch data from `data` with `selector` and `level`
@@ -52,6 +52,11 @@
    idx_slc = (selector, slice(None, None))
    if get_level_index(df, level) == 1:
        idx_slc = idx_slc[1], idx_slc[0]
-    return df.loc[
-        pd.IndexSlice[idx_slc],
-    ]  # This could be faster than df.loc(axis=0)[idx_slc]
+    if fetch_orig:
+        for slc in idx_slc:
+            if slc != slice(None, None):
+                return df.loc[pd.IndexSlice[idx_slc],]
+        else:
+            # for-else: no selector narrowed the data, so return the original df
+            return df
+    else:
+        return df.loc[pd.IndexSlice[idx_slc],]
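The branch above returns the original frame, with no copy, whenever every selector is the full slice. A self-contained pandas sketch of that check, mimicking rather than reproducing the qlib helper:

```python
import pandas as pd

def fetch_by_index(df, idx_slc, fetch_orig=True):
    # If every selector is the full slice, hand back the original object;
    # otherwise fall through to .loc, which produces a filtered copy.
    if fetch_orig and all(slc == slice(None, None) for slc in idx_slc):
        return df
    return df.loc[pd.IndexSlice[idx_slc], :]

idx = pd.MultiIndex.from_product(
    [["2020-01-02", "2020-01-03"], ["SH600000", "SZ000001"]],
    names=["datetime", "instrument"],
)
df = pd.DataFrame({"close": [1.0, 2.0, 3.0, 4.0]}, index=idx)

full = fetch_by_index(df, (slice(None, None), slice(None, None)))
sub = fetch_by_index(df, (slice(None, None), ["SH600000"]))
print(full is df, len(sub))  # True 2
```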
8 changes: 5 additions & 3 deletions qlib/workflow/utils.py
@@ -5,9 +5,9 @@
from . import R
from .recorder import Recorder
from ..log import get_module_logger

logger = get_module_logger("workflow", "INFO")


# function to handle the experiment when unusual program ending occurs
def experiment_exit_handler():
"""
@@ -31,9 +31,11 @@ def experiment_exception_hook(type, value, tb):
    value: Exception's value
    tb: Exception's traceback
    """
-    logger.error("An exception has been raised.")
+    logger.error(f"An exception has been raised[{type.__name__}: {value}].")

+    # Same as original format
    traceback.print_tb(tb)
-    print(f"{type}: {value}")
+    print(f"{type.__name__}: {value}")

    R.end_exp(recorder_status=Recorder.STATUS_FA)

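The hook above is installed as `sys.excepthook` elsewhere in this module; a self-contained sketch of the same pattern, with the logger and the `R.end_exp` call replaced by prints for brevity:

```python
import sys
import traceback

def exception_hook(exc_type, value, tb):
    # Mirror the format above: one summary line, the traceback, then the
    # exception repeated in the interpreter's usual "Type: value" form.
    print(f"An exception has been raised[{exc_type.__name__}: {value}].")
    traceback.print_tb(tb)
    print(f"{exc_type.__name__}: {value}")
    # The real hook would also mark the running experiment as failed here.

sys.excepthook = exception_hook  # invoked for any uncaught exception
```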
61 changes: 61 additions & 0 deletions scripts/README.md
@@ -0,0 +1,61 @@

- [Download Qlib Data](#Download-Qlib-Data)
  - [Download CN Data](#Download-CN-Data)
  - [Download US Data](#Download-US-Data)
  - [Download CN Simple Data](#Download-CN-Simple-Data)
  - [Help](#Help)
- [Using in Qlib](#Using-in-Qlib)
  - [US data](#US-data)
  - [CN data](#CN-data)


## Download Qlib Data


### Download CN Data

```bash
python get_data.py qlib_data --target_dir ~/.qlib/qlib_data/cn_data --region cn
```

### Download US Data

> The US stock list contains the symbol 'PRN', which is a reserved device name on Windows, so a directory with that name cannot be created: https://superuser.com/questions/613313/why-cant-we-make-con-prn-null-folder-in-windows
```bash
python get_data.py qlib_data --target_dir ~/.qlib/qlib_data/us_data --region us
```
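Background for the note above: 'PRN' is one of Windows' reserved device names (CON, PRN, AUX, NUL, COM1 to COM9, LPT1 to LPT9), and the base name stays reserved regardless of any extension. A small portable check; the helper name is ours, not part of any library:

```python
# Windows reserved device names per Microsoft's file-naming rules; a path
# component whose base name matches any of these cannot be created there.
RESERVED = {"CON", "PRN", "AUX", "NUL"} \
    | {f"COM{i}" for i in range(1, 10)} \
    | {f"LPT{i}" for i in range(1, 10)}

def is_reserved_name(name: str) -> bool:
    # The check ignores case and any extension: "prn.txt" is still reserved.
    return name.split(".")[0].upper() in RESERVED

print(is_reserved_name("PRN"), is_reserved_name("AAPL"))  # True False
```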

### Download CN Simple Data

```bash
python get_data.py qlib_data --name qlib_data_simple --target_dir ~/.qlib/qlib_data/cn_data --region cn
```

### Help

```bash
python get_data.py qlib_data --help
```

## Using in Qlib
> For more information: https://qlib.readthedocs.io/en/latest/start/initialization.html

### US data

```python
import qlib
from qlib.config import REG_US
provider_uri = "~/.qlib/qlib_data/us_data" # target_dir
qlib.init(provider_uri=provider_uri, region=REG_US)
```

### CN data

```python
import qlib
from qlib.config import REG_CN
provider_uri = "~/.qlib/qlib_data/cn_data" # target_dir
qlib.init(provider_uri=provider_uri, region=REG_CN)
```