opendilab · puyuan1996 · Jul 4, 2024 · Dec 5, 2023 · Dec 5, 2023 · Dec 9, 2023
diff --git a/.gitignore b/.gitignore
@@ -1444,4 +1444,7 @@ events.*
 !/lzero/mcts/**/lib/*.h
 **/tb/*
 **/mcts/ctree/tests_cpp/*
-**/*tmp*
+**/*tmp*
+
+# pooltool-specific stuff
+!/assets/pooltool/**
diff --git a/README.md b/README.md
@@ -139,6 +139,7 @@ The environments and algorithms currently supported by LightZero are shown in th
 | MiniGrid      | ---      | ✔     | ✔          | ✔               | 🔒         | 🔒             |
 | Bsuite        | ---      | ✔     | ✔          | ✔               | 🔒         | 🔒             |
 | Memory        | ---      | ✔     | ✔          | ✔               | 🔒         | 🔒             |
+| SumToThree (billiards) | ---      | 🔒     | 🔒          | ✔               | 🔒         | 🔒             |
 
 <sup>(1): "✔" means that the corresponding item is finished and well-tested.</sup>
 

diff --git a/README.zh.md b/README.zh.md
@@ -127,6 +127,7 @@ LightZero 目前支持的环境及算法如下表所示：
 | MiniGrid      | ---      | ✔     | ✔          | ✔               | 🔒         | 🔒             |
 | Bsuite        | ---      | ✔     | ✔          | ✔               | 🔒         | 🔒             |
 | Memory        | ---      | ✔     | ✔          | ✔               | 🔒         | 🔒             |
+| SumToThree (billiards) | ---      | 🔒     | 🔒          | ✔               | 🔒         | 🔒             |
 
 <sup>(1): "✔" 表示对应的项目已经完成并经过良好的测试。</sup>
 

diff --git a/assets/pooltool/3hits.gif b/assets/pooltool/3hits.gif
diff --git a/assets/pooltool/4hits.gif b/assets/pooltool/4hits.gif
diff --git a/assets/pooltool/cts.png b/assets/pooltool/cts.png
diff --git a/assets/pooltool/cts_zoom.png b/assets/pooltool/cts_zoom.png
diff --git a/assets/pooltool/discrete.png b/assets/pooltool/discrete.png
diff --git a/assets/pooltool/feature_planes.png b/assets/pooltool/feature_planes.png
diff --git a/assets/pooltool/largecut.gif b/assets/pooltool/largecut.gif
diff --git a/assets/pooltool/nocut.gif b/assets/pooltool/nocut.gif
diff --git a/lzero/model/common.py b/lzero/model/common.py
@@ -13,7 +13,8 @@
 import torch.nn as nn
 from ding.torch_utils import MLP, ResBlock
 from ding.utils import SequenceType
-
+import torch.nn.init as init
+import torch.nn.functional as F
 
 # use dataclass to make the output of network more convenient to use
 @dataclass
@@ -35,6 +36,31 @@ class MZNetworkOutput:
     latent_state: torch.Tensor
 
 
+
+class SimNorm(nn.Module):
+    """
+    Overview:
+        Simplicial normalization. The last dimension of the input is split into consecutive groups of
+        ``simnorm_dim`` elements and a softmax is applied within each group, so that every group sums to 1.
+        The output has the same shape as the input.
+        Adapted from https://arxiv.org/abs/2204.00616.
+    """
+
+    def __init__(self, simnorm_dim: int) -> None:
+        """
+        Arguments:
+            - simnorm_dim (:obj:`int`): The number of elements in each softmax group (stored as ``self.dim``).
+        """
+        super().__init__()
+        self.dim = simnorm_dim
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """
+        Overview:
+            Apply a softmax over each group of ``self.dim`` consecutive elements along the last dimension.
+        Arguments:
+            - x (:obj:`torch.Tensor`): The input tensor. ``x.shape[1]`` is read, so at least 2 dimensions \
+                are required.
+        Returns:
+            - output (:obj:`torch.Tensor`): A tensor with the same shape as ``x``. The input is returned \
+                unchanged when ``x.shape[1] == 0``.
+        """
+        shp = x.shape
+        # Ensure that there is at least one simplex to normalize across.
+        if shp[1] != 0:
+            # Split the last dimension into groups of ``self.dim`` elements: (..., L) -> (..., L // dim, dim).
+            # NOTE(review): ``view`` needs the last dimension to be a multiple of ``self.dim`` and a
+            # view-compatible memory layout, otherwise it raises - confirm that callers guarantee this.
+            x = x.view(*shp[:-1], -1, self.dim)
+            # Normalize inside each group only.
+            x = F.softmax(x, dim=-1)
+            # Restore the original shape.
+            return x.view(*shp)
+        else:
+            return x
+
+    def __repr__(self) -> str:
+        return f"SimNorm(dim={self.dim})"
+
+
 class DownSample(nn.Module):
 
     def __init__(self, observation_shape: SequenceType, out_channels: int, activation: nn.Module = nn.ReLU(inplace=True),
@@ -140,6 +166,9 @@ def __init__(
             downsample: bool = True,
             activation: nn.Module = nn.ReLU(inplace=True),
             norm_type: str = 'BN',
+            embedding_dim: int = 256,
+            group_size: int = 8,
+            use_sim_norm: bool = False,
     ) -> None:
         """
         Overview:
@@ -174,19 +203,30 @@ def __init__(
                 self.norm = nn.BatchNorm2d(num_channels)
             elif norm_type == 'LN':
                 if downsample:
-                    self.norm = nn.LayerNorm([num_channels, math.ceil(observation_shape[-2] / 16), math.ceil(observation_shape[-1] / 16)])
+                    self.norm = nn.LayerNorm(
+                        [num_channels, math.ceil(observation_shape[-2] / 16), math.ceil(observation_shape[-1] / 16)],
+                        eps=1e-5)
                 else:
-                    self.norm = nn.LayerNorm([num_channels, observation_shape[-2], observation_shape[-1]])
-            
+                    self.norm = nn.LayerNorm([num_channels, observation_shape[-2], observation_shape[-1]], eps=1e-5)
+
         self.resblocks = nn.ModuleList(
             [
                 ResBlock(
-                    in_channels=num_channels, activation=activation, norm_type='BN', res_type='basic', bias=False
+                    in_channels=num_channels, activation=activation, norm_type=norm_type, res_type='basic', bias=False
                 ) for _ in range(num_res_blocks)
             ]
         )
         self.activation = activation
 
+        self.use_sim_norm = use_sim_norm
+
+        if self.use_sim_norm:
+            self.embedding_dim = embedding_dim
+            self.last_linear = nn.Linear(64 * 8 * 8, self.embedding_dim, bias=False)
+            # Initialize weights using He initialization
+            init.kaiming_normal_(self.last_linear.weight, mode='fan_out', nonlinearity='relu')
+            self.sim_norm = SimNorm(simnorm_dim=group_size)
+
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         """
         Shapes:
@@ -204,20 +244,13 @@ def forward(self, x: torch.Tensor) -> torch.Tensor:
 
         for block in self.resblocks:
             x = block(x)
-        return x
 
-    def get_param_mean(self) -> float:
-        """
-        Overview:
-            Get the mean of parameters in the network for debug and visualization.
-        Returns:
-            - mean (:obj:`float`): The mean of parameters in the network.
-        """
-        mean = []
-        for name, param in self.named_parameters():
-            mean += np.abs(param.detach().cpu().numpy().reshape(-1)).tolist()
-        mean = sum(mean) / len(mean)
-        return mean
+        if self.use_sim_norm:
+            # NOTE: very important. 
+            # for atari 64,8,8 = 4096 -> 768
+            x = self.sim_norm(x)
+
+        return x
 
 
 class RepresentationNetworkMLP(nn.Module):
@@ -227,9 +260,9 @@ def __init__(
             observation_shape: int,
             hidden_channels: int = 64,
             layer_num: int = 2,
-            activation: Optional[nn.Module] = nn.ReLU(inplace=True),
-            last_linear_layer_init_zero: bool = True,
+            activation: nn.Module = nn.GELU(),
             norm_type: Optional[str] = 'BN',
+            group_size: int = 8,
     ) -> torch.Tensor:
         """
         Overview:
@@ -244,8 +277,6 @@ def __init__(
                 we don't need this module.
             - activation (:obj:`nn.Module`): The activation function used in network, defaults to nn.ReLU(). \
                 Use the inplace operation to speed up.
-            - last_linear_layer_init_zero (:obj:`bool`): Whether to initialize the last linear layer with zeros, \
-                which can provide stable zero outputs in the beginning, defaults to True.
             - norm_type (:obj:`str`): The type of normalization in networks. defaults to 'BN'.
         """
         super().__init__()
@@ -262,14 +293,18 @@ def __init__(
             # last_linear_layer_init_zero=True is beneficial for convergence speed.
             last_linear_layer_init_zero=True,
         )
+        self.sim_norm = SimNorm(simnorm_dim=group_size)
 
     def forward(self, x: torch.Tensor) -> torch.Tensor:
         """
         Shapes:
             - x (:obj:`torch.Tensor`): :math:`(B, N)`, where B is batch size, N is the length of vector observation.
             - output (:obj:`torch.Tensor`): :math:`(B, hidden_channels)`, where B is batch size.
         """
-        return self.fc_representation(x)
+        x = self.fc_representation(x)
+        x = self.sim_norm(x)
+        return x
+
 
 
 class PredictionNetwork(nn.Module):

diff --git a/requirements.txt b/requirements.txt
@@ -6,4 +6,6 @@ pympler
 bsuite
 minigrid
 moviepy
-pycolab
+pycolab
+pytest
+pooltool-billiards>=0.3.1
diff --git a/zoo/atari/config/atari_muzero_config.py b/zoo/atari/config/atari_muzero_config.py
@@ -1,7 +1,10 @@
 from easydict import EasyDict
+import torch
+device = 1
+torch.cuda.set_device(device)
 
 # options={'PongNoFrameskip-v4', 'QbertNoFrameskip-v4', 'MsPacmanNoFrameskip-v4', 'SpaceInvadersNoFrameskip-v4', 'BreakoutNoFrameskip-v4', ...}
-env_id = 'PongNoFrameskip-v4'
+env_id = 'MsPacmanNoFrameskip-v4'
 
 if env_id == 'PongNoFrameskip-v4':
     action_space_size = 6
@@ -17,21 +20,27 @@
 # ==============================================================
 # begin of the most frequently changed config specified by the user
 # ==============================================================
-collector_env_num = 8
-n_episode = 8
+# collector_env_num = 8
+# n_episode = 8
+collector_env_num = 1
+n_episode = 1
 evaluator_env_num = 3
 num_simulations = 50
-update_per_collect = 1000
+# update_per_collect = 1000
+update_per_collect = None
+model_update_ratio = 0.25
 batch_size = 256
-max_env_step = int(1e6)
-reanalyze_ratio = 0.
+# max_env_step = int(1e6)
+max_env_step = int(1e8)
+# reanalyze_ratio = 0.
+reanalyze_ratio = 1
 eps_greedy_exploration_in_collect = False
 # ==============================================================
 # end of the most frequently changed config specified by the user
 # ==============================================================
 
 atari_muzero_config = dict(
-    exp_name=f'data_mz_ctree/{env_id[:-14]}_muzero_ns{num_simulations}_upc{update_per_collect}_rr{reanalyze_ratio}_seed0',
+    exp_name=f'data_muzero_tune/{env_id[:-14]}_muzero_collect{collector_env_num}_ns{num_simulations}_upc{update_per_collect}-mur{model_update_ratio}_rr{reanalyze_ratio}_no-priority_seed0',
     env=dict(
         stop_value=int(1e6),
         env_id=env_id,
@@ -42,6 +51,13 @@
         manager=dict(shared_memory=False, ),
     ),
     policy=dict(
+        learn=dict(
+            learner=dict(
+                hook=dict(
+                    save_ckpt_after_iter=1000000,  # default is 10000
+                ),
+            ),
+        ),
         model=dict(
             observation_shape=(4, 96, 96),
             frame_stack_num=4,
@@ -65,8 +81,10 @@
             end=0.05,
             decay=int(1e5),
         ),
+        use_priority=False,  # TODO
         use_augmentation=True,
         update_per_collect=update_per_collect,
+        model_update_ratio=model_update_ratio,
         batch_size=batch_size,
         optim_type='SGD',
         lr_piecewise_constant_decay=True,
@@ -75,7 +93,8 @@
         reanalyze_ratio=reanalyze_ratio,
         ssl_loss_weight=2,  # default is 0
         n_episode=n_episode,
-        eval_freq=int(2e3),
+        # eval_freq=int(2e3),
+        eval_freq=int(1e4),
         replay_buffer_size=int(1e6),  # the size/capacity of replay_buffer, in the terms of transitions.
         collector_env_num=collector_env_num,
         evaluator_env_num=evaluator_env_num,

diff --git a/zoo/pooltool/README.md b/zoo/pooltool/README.md
@@ -0,0 +1,123 @@
+# Billiards RL
+
+Welcome to the documentation for billiards simulation within the LightZero framework. Billiards offers an intriguing learning environment for reinforcement learning due to its continuous action space, turn-based play, and the need for long-term planning and strategy formulation.
+
+## Pooltool
+
+Pooltool is a general purpose billiards simulator crafted specifically for science and engineering applications (learn more [here](https://github.com/ekiefl/pooltool)). It has been incorporated into LightZero to create diverse learning environments for billiards games.
+
+## Testing your installation
+
+Pooltool comes pre-installed with LightZero. If you are using a custom setup, follow the _pip_ install instructions [here](https://pooltool.readthedocs.io/en/latest/getting_started/install.html#install-option-1-pip).
+
+Verify pooltool is found in your python path:
+
+```bash
+python -c "import pooltool; print(pooltool.__version__)"
+```
+
+Further test your installation by opening the interactive interface:
+
+```bash
+# Unix
+run_pooltool
+
+# Windows
+run_pooltool.bat
+```
+
+(For instructions on how to play, check out the [Getting Started tutorial](https://pooltool.readthedocs.io/en/latest/getting_started/interface.html))
+
+## Supported Games
+
+The following games are currently supported:
+
+1. **Sum to Three**: A simplified billiards game designed to make learning easier for agents.
+2. **Standard Billiards Games** (planned for future updates): Including 8-ball, 9-ball, and snooker.
+
+The rest of the document provides details for each supported game.
+
+## Game 1: Sum to Three
+
+Standard billiards games like 8-ball, 9-ball, and snooker have complex rulesets which make learning more difficult.
+
+In contrast, _sum to three_ is a fictitious billiards game with a simple ruleset.
+
+### Rules
+
+1. The game is played on a table with no pockets
+1. There are 2 balls: a cue ball and an object ball
+1. The player must hit the object ball with the cue ball
+1. The player scores a point if the total number of ball-cushion collisions during the shot is exactly 3
+1. The player takes 10 shots, and their final score is the number of points they achieve
+
+For example, this is a successful shot because there are three ball-cushion collisions:
+
+<img src="../../assets/pooltool/3hits.gif" width="600" />
+
+This is an unsuccessful shot because there are four ball-cushion collisions:
+
+<img src="../../assets/pooltool/4hits.gif" width="600" />
+
+### Observation / Action Spaces
+
+Continuous and discrete observation spaces are supported. The continuous observation space uses the coordinates of the two balls as the observation. The discrete observation space is based on configurable image-based feature planes.
+
+In general, when an agent strikes a cue ball, the cue stick is described by 5 continuous parameters:
+
+```
+V0 : positive float
+    With what initial velocity does the cue strike the ball?
+phi : float (degrees)
+    The direction you strike the ball
+theta : float (degrees)
+    How elevated is the cue from the playing surface, in degrees?
+a : float
+    How much side english should be put on? -1 being rightmost side of ball, +1 being
+    leftmost side of ball
+b : float
+    How much vertical english should be put on? -1 being bottom-most side of ball, +1 being
+    topmost side of ball
+```
+
+Since sum to three is a simple game, only a reduced action space with 2 parameters is supported:
+
+1. V0: The speed of the cue stick. Increasing this means the cue ball travels further
+1. cut angle: The angle at which the cue ball hits the object ball
+
+For example, in this shot, the cut angle is -70 (hitting the left side of the object ball):
+
+<img src="../../assets/pooltool/largecut.gif" width="600" />
+
+For example, in this shot, the cut angle is 0 (head-on collision):
+
+<img src="../../assets/pooltool/nocut.gif" width="600" />
+
+Based on the game dimensions, a suitable bound for the action parameters is used: [0.3, 3] for speed and [-70, 70] for cut angle.
+
+### Experiments
+
+You can conduct experiments using different observation spaces:
+
+1. **Continuous Observation Space Experiment**:
+    - Run the experiment with:
+      ```bash
+      python ./zoo/pooltool/sum_to_three/config/sum_to_three_config.py
+      ```
+    - Results will be saved in `./data_pooltool_sampled_efficientzero/image-obs`.
+
+2. **Discrete Observation Space Experiment**:
+    - Run the experiment with:
+      ```bash
+      python ./zoo/pooltool/sum_to_three/config/sum_to_three_image_config.py
+      ```
+    - Modify the feature plane information by editing `./zoo/pooltool/sum_to_three/config/feature_plane_config.json`. View the usage example in `./zoo/pooltool/image_representation.py` for details about the feature plane content.
+    - Results will be saved in `./data_pooltool_sampled_efficientzero/vector-obs`.
+
+### Results
+
+TODO(puyuan1996)
+
+## Game 2: 8-ball / 9-ball / 3-cushion / snooker
+
+What billiards game would you like to see next?