 
 import copy
 from dataclasses import dataclass
+from functools import partial
 from typing import Any, Dict, List, Mapping, Optional, Tuple, TypeVar, Union
 
 import torch
@@ -46,6 +47,47 @@ def _append_table_shard(
     d[table_name].append(shard)
 
 
+def post_state_dict_hook(
+    # Union["ShardedQuantEmbeddingBagCollection", "ShardedQuantEmbeddingCollection"]
+    # pyre-ignore [24]
+    module: ShardedEmbeddingModule,
+    destination: Dict[str, torch.Tensor],
+    prefix: str,
+    _local_metadata: Dict[str, Any],
+    tables_weights_prefix: str,  # "embedding_bags" or "embeddings"
+) -> None:
+    for (
+        table_name,
+        sharded_t,
+    ) in module._table_name_to_sharded_tensor.items():
+        destination[f"{prefix}{tables_weights_prefix}.{table_name}.weight"] = sharded_t
+
+    for sfx, dict_sharded_t, dict_t_list in [
+        (
+            "weight_qscale",
+            module._table_name_to_sharded_tensor_qscale,
+            module._table_name_to_tensors_list_qscale,
+        ),
+        (
+            "weight_qbias",
+            module._table_name_to_sharded_tensor_qbias,
+            module._table_name_to_tensors_list_qbias,
+        ),
+    ]:
+        for (
+            table_name,
+            sharded_t,
+        ) in dict_sharded_t.items():
+            destination[f"{prefix}{tables_weights_prefix}.{table_name}.{sfx}"] = (
+                sharded_t
+            )
+        for (
+            table_name,
+            t_list,
+        ) in dict_t_list.items():
+            destination[f"{prefix}{tables_weights_prefix}.{table_name}.{sfx}"] = t_list
+
+
 class ShardedQuantEmbeddingModuleState(
     ShardedEmbeddingModule[CompIn, DistOut, Out, ShrdCtx]
 ):
@@ -82,17 +124,6 @@ def _initialize_torch_state( # noqa: C901
         ] = {}
         self._table_name_to_tensors_list_qbias: Dict[str, List[torch.Tensor]] = {}
 
-        # pruning_index_remappings
-        self._table_name_to_local_shards_pruning_index_remappings: Dict[
-            str, List[Shard]
-        ] = {}
-        self._table_name_to_sharded_tensor_pruning_index_remappings: Dict[
-            str, Union[torch.Tensor, ShardedTensorBase]
-        ] = {}
-        self._table_name_to_tensors_list_pruning_index_remappings: Dict[
-            str, List[torch.Tensor]
-        ] = {}
-
         for tbe, config in tbes.items():
             for (tbe_split_w, tbe_split_qscale, tbe_split_qbias), table in zip(
                 tbe.split_embedding_weights_with_scale_bias(split_scale_bias_mode=2),
@@ -184,43 +215,6 @@ def _initialize_torch_state( # noqa: C901
                             Shard(tensor=tbe_split_qparam, metadata=qmetadata),
                         )
                     # end of weight_qscale & weight_qbias section
-                if table.pruning_indices_remapping is not None:
-                    for (
-                        qparam,
-                        table_name_to_local_shards,
-                        _,
-                    ) in [
-                        (
-                            table.pruning_indices_remapping,
-                            self._table_name_to_local_shards_pruning_index_remappings,
-                            self._table_name_to_tensors_list_pruning_index_remappings,
-                        )
-                    ]:
-                        parameter_sharding: ParameterSharding = (
-                            table_name_to_parameter_sharding[table.name]
-                        )
-                        sharding_type: str = parameter_sharding.sharding_type
-
-                        assert sharding_type in [
-                            ShardingType.TABLE_WISE.value,
-                            ShardingType.COLUMN_WISE.value,
-                        ]
-
-                        qmetadata = ShardMetadata(
-                            shard_offsets=[0],
-                            shard_sizes=[
-                                qparam.shape[0],
-                            ],
-                            placement=table.local_metadata.placement,
-                        )
-                        # TODO(ivankobzarev): "meta" sharding support: cleanup when copy to "meta" moves all tensors to "meta"
-                        if qmetadata.placement.device != qparam.device:
-                            qmetadata.placement = _remote_device(qparam.device)
-                        _append_table_shard(
-                            table_name_to_local_shards,
-                            table.name,
-                            Shard(tensor=qparam, metadata=qmetadata),
-                        )
 
         for table_name_to_local_shards, table_name_to_sharded_tensor in [
             (self._table_name_to_local_shards, self._table_name_to_sharded_tensor),
@@ -263,65 +257,9 @@ def _initialize_torch_state( # noqa: C901
                     )
                 )
 
-        for table_name_to_local_shards, table_name_to_sharded_tensor in [
-            (
-                self._table_name_to_local_shards_pruning_index_remappings,
-                self._table_name_to_sharded_tensor_pruning_index_remappings,
-            ),
-        ]:
-            for table_name, local_shards in table_name_to_local_shards.items():
-                # Single Tensor per table (TW sharding)
-                table_name_to_sharded_tensor[table_name] = local_shards[0].tensor
-                continue
-
-        def post_state_dict_hook(
-            # Union["ShardedQuantEmbeddingBagCollection", "ShardedQuantEmbeddingCollection"]
-            module: ShardedQuantEmbeddingModuleState[CompIn, DistOut, Out, ShrdCtx],
-            destination: Dict[str, torch.Tensor],
-            prefix: str,
-            _local_metadata: Dict[str, Any],
-        ) -> None:
-            for (
-                table_name,
-                sharded_t,
-            ) in module._table_name_to_sharded_tensor.items():
-                destination[f"{prefix}{tables_weights_prefix}.{table_name}.weight"] = (
-                    sharded_t
-                )
-
-            for sfx, dict_sharded_t, dict_t_list in [
-                (
-                    "weight_qscale",
-                    module._table_name_to_sharded_tensor_qscale,
-                    module._table_name_to_tensors_list_qscale,
-                ),
-                (
-                    "weight_qbias",
-                    module._table_name_to_sharded_tensor_qbias,
-                    module._table_name_to_tensors_list_qbias,
-                ),
-                (
-                    "index_remappings_array",
-                    module._table_name_to_sharded_tensor_pruning_index_remappings,
-                    module._table_name_to_tensors_list_pruning_index_remappings,
-                ),
-            ]:
-                for (
-                    table_name,
-                    sharded_t,
-                ) in dict_sharded_t.items():
-                    destination[
-                        f"{prefix}{tables_weights_prefix}.{table_name}.{sfx}"
-                    ] = sharded_t
-                for (
-                    table_name,
-                    t_list,
-                ) in dict_t_list.items():
-                    destination[
-                        f"{prefix}{tables_weights_prefix}.{table_name}.{sfx}"
-                    ] = t_list
-
-        self._register_state_dict_hook(post_state_dict_hook)
+        self._register_state_dict_hook(
+            partial(post_state_dict_hook, tables_weights_prefix=tables_weights_prefix)
+        )
 
     def _load_from_state_dict(
         # Union["ShardedQuantEmbeddingBagCollection", "ShardedQuantEmbeddingCollection"]
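For context on the pattern this change adopts: `torch.nn.Module._register_state_dict_hook` invokes its hook as `hook(module, state_dict, prefix, local_metadata)`, so a hook that was previously a closure over method locals can be lifted to module level and bound to per-instance arguments (here `tables_weights_prefix`) with `functools.partial` at registration time. The sketch below is a minimal, self-contained illustration of that pattern under assumed names; `DemoModule` and `demo_post_state_dict_hook` are illustrative only and are not part of torchrec.

```python
# Minimal sketch (assumed names, not torchrec code) of binding an extra argument
# to a module-level state_dict hook with functools.partial, as the diff above does.
from functools import partial
from typing import Any, Dict

import torch


def demo_post_state_dict_hook(
    module: torch.nn.Module,
    destination: Dict[str, torch.Tensor],
    prefix: str,
    _local_metadata: Dict[str, Any],
    tables_weights_prefix: str,
) -> None:
    # Hooks registered via _register_state_dict_hook may mutate `destination`
    # in place; here we publish an extra key under the bound prefix.
    destination[f"{prefix}{tables_weights_prefix}.table_0.weight"] = module.weight.detach()


class DemoModule(torch.nn.Module):
    def __init__(self, tables_weights_prefix: str) -> None:
        super().__init__()
        self.weight = torch.nn.Parameter(torch.zeros(4, 2))
        # Bind the instance-specific argument at registration time; the hook
        # itself stays a plain module-level function, as in the change above.
        self._register_state_dict_hook(
            partial(demo_post_state_dict_hook, tables_weights_prefix=tables_weights_prefix)
        )


if __name__ == "__main__":
    sd = DemoModule("embedding_bags").state_dict()
    print(sorted(sd.keys()))  # ['embedding_bags.table_0.weight', 'weight']
```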