|
17 | 17 | import threading
|
18 | 18 | import time
|
19 | 19 | from math import floor, log2
|
20 |
| -from typing import Any, Callable, List, Optional, Tuple, Type, Union |
| 20 | +from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union |
21 | 21 | import torch # usort:skip
|
22 | 22 |
|
23 | 23 | # @manual=//deeplearning/fbgemm/fbgemm_gpu/codegen:split_embedding_codegen_lookup_invokers
|
@@ -156,6 +156,7 @@ def __init__(
|
156 | 156 | lazy_bulk_init_enabled: bool = False,
|
157 | 157 | backend_type: BackendType = BackendType.SSD,
|
158 | 158 | kv_zch_params: Optional[KVZCHParams] = None,
|
| 159 | + enable_optimizer_offloading: bool = False, # whether to enable optimizer offloading |
159 | 160 | ) -> None:
|
160 | 161 | super(SSDTableBatchedEmbeddingBags, self).__init__()
|
161 | 162 |
|
@@ -435,6 +436,21 @@ def __init__(
|
435 | 436 | self.backend_type = backend_type
|
436 | 437 | if self.kv_zch_params:
|
437 | 438 | self.kv_zch_params.validate()
|
| 439 | + self.enable_optimizer_offloading: bool = enable_optimizer_offloading |
| 440 | + # initial number of embeddings on this rank, used for loading |
| 441 | + self.local_weight_counts: List[int] = [0] * T_ |
| 442 | + self.load_state_dict: bool = False |
| 443 | + # For KV ZCH, temporarily cache the loaded id, weight, and optimizer tensors before |
| 444 | + # writing them to the backend. Checkpoint loading does not guarantee tensor loading |
| 445 | + # order, but the id tensor must be loaded before the weight and optimizer tensors |
| 446 | + # can be applied to the backend, so we cache them here and apply them once all |
| 447 | + # tensors are loaded. This duplicates memory for the id, weight, and optimizer |
| 448 | + # tensors, but it is an acceptable tradeoff for correct checkpoint loading until |
| 449 | + # the backend supports streaming loading with the id tensor first. |
| 450 | + self._cached_id_tensor_per_table: Optional[List[torch.Tensor]] = None |
| 451 | + self._cached_weight_tensor_per_table: Optional[List[torch.Tensor]] = None |
| 452 | + self._cached_optimizer_state_per_table: Optional[List[torch.Tensor]] = None |
| 453 | + self._cached_bucket_splits: Optional[List[torch.Tensor]] = None |
438 | 454 |
|
439 | 455 | # create tbe unique id using rank index | local tbe idx
|
440 | 456 | if tbe_unique_id == -1:
|
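
A minimal sketch of the deferred-apply idea described in the cached-tensor comment above: keep the per-table id, weight, and optimizer tensors cached while the checkpoint is being loaded, and only write them to the backend once all three are present, ids first. The `backend` object and its `set_*` methods are placeholders for illustration, not the actual fbgemm_gpu SSD/DRAM-KV backend API.

```python
# Hypothetical sketch only: `backend` and its `set_*` methods are placeholders,
# not the real backend API used by SSDTableBatchedEmbeddingBags.
from typing import List, Optional

import torch


def apply_cached_checkpoint_state(
    backend,
    cached_ids: Optional[List[torch.Tensor]],
    cached_weights: Optional[List[torch.Tensor]],
    cached_opt_states: Optional[List[torch.Tensor]],
) -> None:
    """Apply cached per-table tensors only after everything has been loaded."""
    if cached_ids is None or cached_weights is None or cached_opt_states is None:
        # Checkpoint loading order is not guaranteed, so wait until all pieces exist.
        return
    for t, ids in enumerate(cached_ids):
        # Ids go first so the backend knows which rows the weight and optimizer
        # values belong to; only then can the per-row payloads be attached.
        backend.set_ids(t, ids)
        backend.set_weights(t, ids, cached_weights[t])
        backend.set_optimizer_states(t, ids, cached_opt_states[t])
```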
@@ -1777,6 +1793,46 @@ def debug_split_optimizer_states(self) -> List[Tuple[torch.Tensor, int, int]]:
|
1777 | 1793 | for t, row in enumerate(rows)
|
1778 | 1794 | ]
|
1779 | 1795 |
|
| 1796 | + @torch.jit.export |
| 1797 | + def split_optimizer_states( |
| 1798 | + self, |
| 1799 | + sorted_id_tensor: Optional[List[torch.Tensor]] = None, |
| 1800 | + ) -> List[torch.Tensor]: |
| 1801 | + """ |
| 1802 | + Returns a list of optimizer states split by table. So far, we only support EXACT_ROWWISE_ADAGRAD, |
| 1803 | + so only the momentum1 state is returned. |
| 1804 | +
|
| 1805 | + Since EXACT_ROWWISE_ADAGRAD has small optimizer states, we generate a full |
| 1806 | + tensor for each table (shard). When other optimizer types are supported, we |
| 1807 | + should integrate with KVTensorWrapper (ssd_split_table_batched_embeddings.cpp) |
| 1808 | + to allow the caller to read the optimizer states using `narrow()` in a rolling-window manner. |
| 1809 | +
|
| 1810 | + Args: |
| 1811 | + sorted_id_tensor (Optional[List[torch.Tensor]]): sorted id tensor by table, used to query optimizer |
| 1812 | + states from the backend. The caller should reuse the id tensors generated for the weight state_dict |
| 1813 | + to guarantee id consistency between the weight and optimizer states. |
| 1814 | +
|
| 1815 | + """ |
| 1816 | + raise NotImplementedError( |
| 1817 | + "split_optimizer_states is not implemented for SSDTableBatchedEmbeddingBags" |
| 1818 | + ) |
| 1819 | + |
| 1820 | + @torch.jit.export |
| 1821 | + def get_optimizer_state( |
| 1822 | + self, |
| 1823 | + sorted_id_tensor: Optional[List[torch.Tensor]], |
| 1824 | + ) -> List[Dict[str, torch.Tensor]]: |
| 1825 | + """ |
| 1826 | + Returns a list of optimizer states split by table. So far, we only support EXACT_ROWWISE_ADAGRAD, |
| 1827 | + so only the momentum1 state is returned. |
| 1828 | + """ |
| 1829 | + return [ |
| 1830 | + ({"momentum1": states}) |
| 1831 | + for states in self.split_optimizer_states( |
| 1832 | + sorted_id_tensor=sorted_id_tensor, |
| 1833 | + ) |
| 1834 | + ] |
| 1835 | + |
1780 | 1836 | @torch.jit.export
|
1781 | 1837 | def debug_split_embedding_weights(self) -> List[torch.Tensor]:
|
1782 | 1838 | """
|
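
A hedged usage sketch for the new optimizer-state API in the hunk above: the bucket-sorted id tensors returned by `split_embedding_weights()` (shown in a later hunk) are reused as `sorted_id_tensor` so that optimizer rows line up with weight rows. In this base class `split_optimizer_states` still raises `NotImplementedError`, so the snippet only shows the intended calling convention; the import path and the no-argument `split_embedding_weights()` call are assumptions.

```python
from typing import Dict, List

import torch

# Import path assumed; it may differ across fbgemm_gpu versions.
from fbgemm_gpu.tbe.ssd import SSDTableBatchedEmbeddingBags


def read_rowwise_adagrad_states(
    tbe: SSDTableBatchedEmbeddingBags,
) -> List[Dict[str, torch.Tensor]]:
    """Fetch per-table optimizer states, keyed "momentum1" for EXACT_ROWWISE_ADAGRAD."""
    # split_embedding_weights() returns (weights, bucket-sorted ids, active id counts);
    # this assumes KV ZCH is enabled so the bucket-sorted id splits are not None.
    _weights, bucket_sorted_ids, _active_counts = tbe.split_embedding_weights()
    # Reuse the generated id tensors to keep weight and optimizer rows consistent.
    return tbe.get_optimizer_state(sorted_id_tensor=bucket_sorted_ids)
```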
@@ -1822,6 +1878,11 @@ def debug_split_embedding_weights(self) -> List[torch.Tensor]:
|
1822 | 1878 |
|
1823 | 1879 | return splits
|
1824 | 1880 |
|
| 1881 | + def clear_cache(self) -> None: |
| 1882 | + self._cached_bucket_splits = None |
| 1883 | + self._cached_id_tensor_per_table = None |
| 1884 | + self._cached_weight_tensor_per_table = None |
| 1885 | + |
1825 | 1886 | @torch.jit.export
|
1826 | 1887 | def split_embedding_weights(
|
1827 | 1888 | self,
|
@@ -1926,6 +1987,18 @@ def split_embedding_weights(
|
1926 | 1987 | )
|
1927 | 1988 | return (pmt_splits, bucket_sorted_id_splits, active_id_cnt_per_bucket_split)
|
1928 | 1989 |
|
| 1990 | + @torch.jit.ignore |
| 1991 | + def apply_state_dict(self) -> None: |
| 1992 | + # After checkpoint loading, _cached_bucket_splits, _cached_id_tensor_per_table, |
| 1993 | + # and _cached_weight_tensor_per_table have been populated from the checkpoint. |
| 1994 | + # The caller should call this function to apply the cached state to the backend. |
| 1995 | + pass |
| 1996 | + |
| 1997 | + @torch.jit.ignore |
| 1998 | + def enable_load_state_dict_mode(self) -> None: |
| 1999 | + # Enable load-state-dict mode before loading a checkpoint |
| 2000 | + pass |
| 2001 | + |
1929 | 2002 | @torch.jit.export
|
1930 | 2003 | def set_learning_rate(self, lr: float) -> None:
|
1931 | 2004 | """
|
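
A hedged sketch of the checkpoint-restore sequence the new hooks imply, based on the comments in the hunk above: enter load-state-dict mode, run the trainer's checkpoint loading (a placeholder callable here, not an fbgemm_gpu API), apply the cached per-table state to the backend, then release the duplicated copies. Both hooks are still `pass` stubs in this diff, so this only illustrates the intended ordering.

```python
from typing import Any, Callable


def restore_tbe_from_checkpoint(tbe: Any, load_checkpoint_fn: Callable[[Any], None]) -> None:
    """Illustrative ordering only; the hooks are still stubs in this diff."""
    # 1. Enter load-state-dict mode so tensors loaded from the checkpoint are cached
    #    per table instead of being written to the SSD/KV backend immediately.
    tbe.enable_load_state_dict_mode()

    # 2. Run whatever checkpoint-loading mechanism the trainer uses (placeholder).
    #    Loading order of id / weight / optimizer tensors is not guaranteed.
    load_checkpoint_fn(tbe)

    # 3. With ids, weights, and optimizer states all cached, push them to the backend.
    tbe.apply_state_dict()

    # 4. Drop the cached copies to release the duplicated memory.
    tbe.clear_cache()
```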