Merge branch 'master' into zecheng_text_example_update

pyg-team · Jan 8, 2024 · 08ae8b7 · 08ae8b7
2 parents 0dc3467 + 5066869
commit 08ae8b7
Show file tree

Hide file tree

Showing 29 changed files with 396 additions and 146 deletions.
diff --git a/.github/workflows/documentation.yml b/.github/workflows/documentation.yml
@@ -20,7 +20,7 @@ jobs:
       # Skip workflow if only certain files have been changed.
       - name: Get changed files
         id: changed-files-specific
-        uses: tj-actions/changed-files@v34
+        uses: tj-actions/changed-files@v41
         with:
           files: |
             docs/**

diff --git a/.github/workflows/latest_testing.yml b/.github/workflows/latest_testing.yml
@@ -20,7 +20,7 @@ jobs:
       # Skip workflow if only certain files have been changed.
       - name: Get changed files
         id: changed-files-specific
-        uses: tj-actions/changed-files@v34
+        uses: tj-actions/changed-files@v41
         with:
           files: |
             docs/**

diff --git a/.github/workflows/prev_testing.yml b/.github/workflows/prev_testing.yml
@@ -25,7 +25,7 @@ jobs:
       # Skip workflow if only certain files have been changed.
       - name: Get changed files
         id: changed-files-specific
-        uses: tj-actions/changed-files@v34
+        uses: tj-actions/changed-files@v41
         with:
           files: |
             docs/**

diff --git a/.github/workflows/testing.yml b/.github/workflows/testing.yml
@@ -20,7 +20,7 @@ jobs:
       # Skip workflow if only certain files have been changed.
       - name: Get changed files
         id: changed-files-specific
-        uses: tj-actions/changed-files@v34
+        uses: tj-actions/changed-files@v41
         with:
           files: |
             docs/**

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -8,7 +8,7 @@ ci:
 
 repos:
   - repo: https://github.com/pre-commit/pre-commit-hooks
-    rev: v4.4.0
+    rev: v4.5.0
     hooks:
       - id: no-commit-to-branch
         name: No commits to master
@@ -27,41 +27,41 @@ repos:
           )$
 
   - repo: https://github.com/adrienverge/yamllint.git
-    rev: v1.32.0
+    rev: v1.33.0
     hooks:
       - id: yamllint
         name: Lint yaml
         args: [-d, '{extends: default, rules: {line-length: disable, document-start: disable, truthy: {level: error}, braces: {max-spaces-inside: 1}}}']
 
   - repo: https://github.com/google/yapf
-    rev: v0.40.0
+    rev: v0.40.2
     hooks:
       - id: yapf
         name: Format code
         additional_dependencies: [toml]
 
   - repo: https://github.com/pycqa/isort
-    rev: 5.12.0
+    rev: 5.13.2
     hooks:
       - id: isort
         name: Sort imports
 
   - repo: https://github.com/PyCQA/flake8
-    rev: 6.0.0
+    rev: 6.1.0
     hooks:
       - id: flake8
         name: Check PEP8
         additional_dependencies: [Flake8-pyproject]
 
   - repo: https://github.com/astral-sh/ruff-pre-commit
-    rev: v0.1.5
+    rev: v0.1.9
     hooks:
       - id: ruff
         name: Ruff formatting
         args: [--fix, --exit-non-zero-on-fix]
 
   - repo: https://github.com/pre-commit/mirrors-mypy
-    rev: v1.7.1
+    rev: v1.8.0
     hooks:
       - id: mypy
         name: Check types

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -7,6 +7,9 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/).
 ## [Unreleased]
 
 ### Added
+- Added `stype_encoder_dict` to some models ([#319](https://github.com/pyg-team/pytorch-frame/pull/319))
+
+- Added `HuggingFaceDatasetDict` ([#287](https://github.com/pyg-team/pytorch-frame/pull/287))
 
 ### Changed
 - Supported decoder embedding model in `examples/transformers_text.py` ([#333](https://github.com/pyg-team/pytorch-frame/pull/333))

diff --git a/README.md b/README.md
@@ -83,7 +83,7 @@ PyTorch Frame builds directly upon PyTorch, ensuring a smooth transition for exi
   Comes with a collection of readily-usable tabular datasets. Also supports custom datasets to solve your own problem.
   We [benchmark](https://github.com/pyg-team/pytorch-frame/blob/master/benchmark) deep tabular models against GBDTs.
 * **PyTorch integration**:
-  Integrates effortlessly with other PyTorch libraries, like [PyG](https://pyg.org/), facilitating end-to-end training of PyTorch Frame with downstream PyTorch models.
+  Integrates effortlessly with other PyTorch libraries, facilitating end-to-end training of PyTorch Frame with downstream PyTorch models. For example, by integrating with [PyG](https://pyg.org/), a PyTorch library for GNNs, we can perform deep learning over relational databases. Learn more in [RelBench](https://relbench.stanford.edu/) and [example code (WIP)](https://github.com/snap-stanford/relbench/blob/main/examples/gnn.py).
 
 ## Architecture Overview
 

diff --git a/docs/source/handling_advanced_stypes/handle_heterogeneous_stypes.rst b/docs/source/handling_advanced_stypes/handle_heterogeneous_stypes.rst
@@ -119,7 +119,7 @@ Now we can specify the :obj:`stype_encoder_dict` to a model of your choice.
         out_channels=1,
         num_layers=2,
         col_stats=dataset.col_stats,
-        col_names_dict=train_tensor_frame.col_names_dict,
+        col_names_dict=dataset.tensor_frame.col_names_dict,
         stype_encoder_dict=stype_encoder_dict,
     )
 

diff --git a/docs/source/handling_advanced_stypes/handle_text.rst b/docs/source/handling_advanced_stypes/handle_text.rst
@@ -360,8 +360,10 @@ specified by :obj:`fill_value`.
             self.model = get_peft_model(self.model, peft_config)
 
         def forward(self, feat: dict[str, MultiNestedTensor]) -> Tensor:
-            # [batch_size, batch_max_seq_len]
+            # Pad [batch_size, 1, *] to [batch_size, 1, batch_max_seq_len], then
+            # squeeze to [batch_size, batch_max_seq_len].
             input_ids = feat["input_ids"].to_dense(fill_value=0).squeeze(dim=1)
+            # Fill padded positions of attention_mask with 0 (i.e., False).
             mask = feat["attention_mask"].to_dense(fill_value=0).squeeze(dim=1)
 
             # Get text embeddings for each text tokenized column

diff --git a/docs/source/modules/datasets.rst b/docs/source/modules/datasets.rst
@@ -29,3 +29,15 @@ Synthetic Datasets
    {% for name in torch_frame.datasets.synthetic_datasets %}
      {{ name }}
    {% endfor %}
+
+Other Datasets
+--------------
+
+.. autosummary::
+   :nosignatures:
+   :toctree: ../generated
+   :template: autosummary/class.rst
+
+   {% for name in torch_frame.datasets.other_datasets %}
+     {{ name }}
+   {% endfor %}
diff --git a/examples/transformers_text.py b/examples/transformers_text.py
@@ -211,8 +211,10 @@ def __init__(self, model: str, pooling: str = "mean", lora: bool = False):
         self.pooling = pooling
 
     def forward(self, feat: dict[str, MultiNestedTensor]) -> Tensor:
-        # [batch_size, batch_max_seq_len]
+        # Pad [batch_size, 1, *] to [batch_size, 1, batch_max_seq_len], then
+        # squeeze to [batch_size, batch_max_seq_len].
         input_ids = feat["input_ids"].to_dense(fill_value=0).squeeze(dim=1)
+        # Fill padded positions of attention_mask with 0 (i.e., False).
         mask = feat["attention_mask"].to_dense(fill_value=0).squeeze(dim=1)
 
         # Get text embeddings for each text tokenized column

diff --git a/pyproject.toml b/pyproject.toml
@@ -51,6 +51,7 @@ full=[
     "optuna>=3.0.0",
     "catboost",
     "lightgbm",
+    "datasets",
 ]
 
 [project.urls]
@@ -125,22 +126,12 @@ module = [
     "torch_frame.gbdt.tuned_catboost",
     "torch_frame.gbdt.tuned_lightgbm",
     "torch_frame.gbdt.tuned_xgboost",
-    "torch_frame.nn.conv.excelformer_conv",
     "torch_frame.nn.encoder.stype_encoder",
-    "torch_frame.nn.models.excelformer",
-    "torch_frame.nn.models.ft_transformer",
-    "torch_frame.nn.models.resnet",
-    "torch_frame.nn.models.tab_transformer",
-    "torch_frame.nn.models.tabnet",
-    "torch_frame.nn.models.trompt",
     "torch_frame.testing.text_tokenizer",
     "torch_frame.transforms.base_transform",
     "torch_frame.transforms.cat_to_num_transform",
     "torch_frame.transforms.fittable_base_transform",
     "torch_frame.transforms.mutual_information_sort",
-    "torch_frame.utils.concat",
-    "torch_frame.utils.infer_stype",
-    "torch_frame.utils.io",
 ]
 
 [tool.pytest.ini_options]

diff --git a/test/nn/models/test_trompt.py b/test/nn/models/test_trompt.py
@@ -1,17 +1,35 @@
 import pytest
 
+from torch_frame import stype
 from torch_frame.data.dataset import Dataset
 from torch_frame.datasets import FakeDataset
-from torch_frame.nn import Trompt
+from torch_frame.nn import EmbeddingEncoder, LinearEncoder, Trompt
 
 
 @pytest.mark.parametrize('batch_size', [0, 5])
-def test_trompt(batch_size):
+@pytest.mark.parametrize('stype_encoder_dicts', [
+    [
+        {
+            stype.numerical: LinearEncoder(),
+            stype.categorical: EmbeddingEncoder(),
+        },
+        {
+            stype.numerical: LinearEncoder(),
+            stype.categorical: EmbeddingEncoder(),
+        },
+        {
+            stype.numerical: LinearEncoder(),
+            stype.categorical: EmbeddingEncoder(),
+        },
+    ],
+    None,
+])
+def test_trompt(batch_size, stype_encoder_dicts):
     batch_size = 10
     channels = 8
     out_channels = 1
     num_prompts = 2
-    num_layers = 6
+    num_layers = 3
     dataset: Dataset = FakeDataset(num_rows=10, with_nan=False)
     dataset.materialize()
     tensor_frame = dataset.tensor_frame[:batch_size]
@@ -22,6 +40,7 @@ def test_trompt(batch_size):
         num_layers=num_layers,
         col_stats=dataset.col_stats,
         col_names_dict=tensor_frame.col_names_dict,
+        stype_encoder_dicts=stype_encoder_dicts,
     )
     model.reset_parameters()
     out = model.forward_stacked(tensor_frame)

diff --git a/torch_frame/data/dataset.py b/torch_frame/data/dataset.py
@@ -382,7 +382,7 @@ def __init__(
                     f"split_col must only contain {set(SPLIT_TO_NUM.values())}"
                 )
         self.split_col = split_col
-        self.col_to_stype = col_to_stype
+        self.col_to_stype = col_to_stype.copy()
 
         cols = self.feat_cols + ([] if target_col is None else [target_col])
         missing_cols = set(cols) - set(df.columns)

diff --git a/torch_frame/data/multi_tensor.py b/torch_frame/data/multi_tensor.py
@@ -28,7 +28,7 @@ def __init__(
     def validate(self) -> None:
         pass
 
-    def to_dict(self) -> dict[str, Any]:
+    def to_dict(self) -> dict[str, int | Tensor]:
         r"""Serialize the object into a dictionary."""
         return {
             "num_rows": self.num_rows,

diff --git a/torch_frame/data/tensor_frame.py b/torch_frame/data/tensor_frame.py
@@ -39,7 +39,7 @@ class TensorFrame:
 
         import torch_frame
 
-        tf = torch_frame.TensorFrame({
+        tf = torch_frame.TensorFrame(
             feat_dict = {
                 # Two numerical columns:
                 torch_frame.numerical: torch.randn(10, 2),
@@ -51,7 +51,7 @@ class TensorFrame:
                 torch_frame.categorical: ['cat_1', 'cat_2', 'cat_3'],
 
             },
-        })
+        )
 
         print(len(tf))
         >>> 10

diff --git a/torch_frame/datasets/__init__.py b/torch_frame/datasets/__init__.py
@@ -15,6 +15,7 @@
 from .data_frame_benchmark import DataFrameBenchmark
 from .mercari import Mercari
 from .amazon_fine_food_reviews import AmazonFineFoodReviews
+from .huggingface_dataset import HuggingFaceDatasetDict
 
 real_world_datasets = [
     'Titanic',
@@ -37,4 +38,8 @@
     'FakeDataset',
 ]
 
-__all__ = real_world_datasets + synthetic_datasets
+other_datasets = [
+    'HuggingFaceDatasetDict',
+]
+
+__all__ = real_world_datasets + synthetic_datasets + other_datasets