pytorch
diff --git a/‎test/test_actors.py
+51-2 b/‎test/test_actors.py
+51-2
diff --git a/‎test/test_env.py
+11-12 b/‎test/test_env.py
+11-12
diff --git a/‎torchrl/data/llm/__init__.py
+8-1 b/‎torchrl/data/llm/__init__.py
+8-1
diff --git a/‎torchrl/data/llm/utils.py
+9-1 b/‎torchrl/data/llm/utils.py
+9-1
diff --git a/‎torchrl/envs/custom/llm.py
+33-15 b/‎torchrl/envs/custom/llm.py
+33-15
diff --git a/‎torchrl/envs/transforms/llm.py
+1-2 b/‎torchrl/envs/transforms/llm.py
+1-2
@@ -8,14 +8,14 @@
 import pytest
 import torch
 
-from tensordict import TensorDict
+from tensordict import NonTensorStack, TensorDict
 from tensordict.nn import CompositeDistribution, TensorDictModule
 from tensordict.nn.distributions import NormalParamExtractor
 
 from torch import distributions as dist, nn
 from torchrl.data import Binary, Bounded, Categorical, Composite, MultiOneHot, OneHot
 from torchrl.data.llm.dataset import _has_transformers
-from torchrl.modules import MLP, SafeModule, TanhDelta, TanhNormal
+from torchrl.modules import from_hf_transformers, MLP, SafeModule, TanhDelta, TanhNormal
 from torchrl.modules.tensordict_module.actors import (
     _process_action_space_spec,
     ActorValueOperator,
@@ -907,6 +907,55 @@ def test_lmhead_actorvalueoperator(device):
     ) == len(policy_params)
 
 
+@pytest.mark.skipif(not _has_transformers, reason="missing transformers dependencies")
+class TestTransformerActor:
+    @pytest.mark.parametrize(
+        "from_text, generate, tokens, attention_mask",
+        [
+            (True, True, None, None),  # text input, generation enabled
+            (True, False, None, None),  # text input, generation disabled
+            (  # token input with an explicit attention mask, generation enabled
+                False,
+                True,
+                torch.randint(1024, (1, 10)),
+                torch.ones(1, 10, dtype=torch.int64),
+            ),
+            (False, True, torch.randint(1024, (1, 10)), None),  # token input, no mask
+        ],
+    )
+    def test_from_hf_transformers(self, from_text, generate, tokens, attention_mask):
+        from torchrl.data.llm import LLMData
+        from transformers import AutoTokenizer, GPT2Config, GPT2LMHeadModel
+
+        tokenizer = AutoTokenizer.from_pretrained("gpt2")
+        tokenizer.pad_token = tokenizer.eos_token
+        model = GPT2LMHeadModel(GPT2Config())  # built from a default config, no checkpoint
+        tokenizer.padding_side = "left"
+        m = from_hf_transformers(
+            model, tokenizer=tokenizer, from_text=from_text, generate=generate
+        )
+        if from_text:
+            tdin = LLMData(text=NonTensorStack("a text"), batch_size=1)
+        else:
+            tdin = LLMData(tokens=tokens, attention_mask=attention_mask, batch_size=1)
+        td = m(tdin)
+        assert td is tdin  # the module must return the same object it was given
+        assert isinstance(td, LLMData)
+        if from_text and generate:  # a text response is expected only for text-in generation
+            assert td.text_response is not None
+        else:
+            assert td.text_response is None
+        if attention_mask is not None or from_text:  # mask was passed in, or input was text
+            assert td.attention_mask is not None
+        else:
+            assert td.attention_mask is None
+        if not generate:  # no responses expected, but log-probs and logits must be set
+            assert td.text_response is None
+            assert td.tokens_response is None
+            assert td.log_probs is not None
+            assert td.logits is not None
+
+
 if __name__ == "__main__":
     args, unknown = argparse.ArgumentParser().parse_known_args()
     pytest.main([__file__, "--capture", "no", "--exitfirst"] + unknown)
@@ -4699,13 +4699,13 @@ def test_llm_from_dataloader(
             def policy(td):
                 if str2str:
                     if not td.shape:
-                        td[LLMEnv._DEFAULT_ACTION_KEY] = "<nothing>"
+                        td[LLMEnv._DEFAULT_ACTION_STR_KEY] = "<nothing>"
                     else:
-                        td[LLMEnv._DEFAULT_ACTION_KEY] = NonTensorStack(
+                        td[LLMEnv._DEFAULT_ACTION_STR_KEY] = NonTensorStack(
                             *["<nothing>" for _ in range(td.shape[0])]
                         )
                 else:
-                    td[LLMEnv._DEFAULT_ACTION_KEY] = torch.ones(
+                    td[LLMEnv._DEFAULT_ACTION_TOKENS_KEY] = torch.ones(
                         td.shape + (1,), dtype=torch.int64
                     )
                 return td
@@ -4720,25 +4720,25 @@ def policy(td):
                     assert (
                         r[0, 0][LLMEnv._DEFAULT_STR_KEY]
                         == r[0, 1][LLMEnv._DEFAULT_STR_KEY][
-                            : -len(r[0, 0][LLMEnv._DEFAULT_ACTION_KEY])
+                            : -len(r[0, 0][LLMEnv._DEFAULT_ACTION_STR_KEY])
                         ]
                     )
                     assert (
                         r[0, 1][LLMEnv._DEFAULT_STR_KEY]
                         == r[0, 2][LLMEnv._DEFAULT_STR_KEY][
-                            : -len(r[0, 1][LLMEnv._DEFAULT_ACTION_KEY])
+                            : -len(r[0, 1][LLMEnv._DEFAULT_ACTION_STR_KEY])
                         ]
                     )
                     assert (
                         r[-1, 0][LLMEnv._DEFAULT_STR_KEY]
                         == r[-1, 1][LLMEnv._DEFAULT_STR_KEY][
-                            : -len(r[-1, 0][LLMEnv._DEFAULT_ACTION_KEY])
+                            : -len(r[-1, 0][LLMEnv._DEFAULT_ACTION_STR_KEY])
                         ]
                     )
                     assert (
                         r[-1, 1][LLMEnv._DEFAULT_STR_KEY]
                         == r[-1, 2][LLMEnv._DEFAULT_STR_KEY][
-                            : -len(r[-1, 1][LLMEnv._DEFAULT_ACTION_KEY])
+                            : -len(r[-1, 1][LLMEnv._DEFAULT_ACTION_STR_KEY])
                         ]
                     )
                 else:
@@ -4815,13 +4815,13 @@ def test_llm_from_dataloader_repeats(
         def policy(td):
             if str2str:
                 if not td.shape:
-                    td[LLMEnv._DEFAULT_ACTION_KEY] = "<nothing>"
+                    td[LLMEnv._DEFAULT_ACTION_STR_KEY] = "<nothing>"
                 else:
-                    td[LLMEnv._DEFAULT_ACTION_KEY] = NonTensorStack(
+                    td[LLMEnv._DEFAULT_ACTION_STR_KEY] = NonTensorStack(
                         *["<nothing>" for _ in range(td.shape[0])]
                     )
             else:
-                td[LLMEnv._DEFAULT_ACTION_KEY] = torch.ones(
+                td[LLMEnv._DEFAULT_ACTION_TOKENS_KEY] = torch.ones(
                     td.shape + (1,), dtype=torch.int64
                 )
             return td
@@ -4957,7 +4957,7 @@ def test_done_and_reward(
             env.append_transform(StepCounter(max_steps=10))
 
             def policy(td):
-                td[LLMEnv._DEFAULT_ACTION_KEY] = torch.ones(
+                td[LLMEnv._DEFAULT_ACTION_TOKENS_KEY] = torch.ones(
                     td.shape + (torch.randint(10, (1,)).item(),), dtype=torch.int64
                 )
                 return td
@@ -4974,7 +4974,6 @@ def policy(td):
             if assign_done:
                 assert "terminated" in r
                 assert "done" in r
-            print(r)
 
 
 if __name__ == "__main__":
 
@@ -11,7 +11,14 @@
 )
 from .prompt import PromptData, PromptTensorDictTokenizer
 from .reward import PairwiseDataset, RewardData
-from .utils import AdaptiveKLController, ConstantKLController, RolloutFromModel, LLMData, LLMOutput, LLMInput
+from .utils import (
+    AdaptiveKLController,
+    ConstantKLController,
+    LLMData,
+    LLMInput,
+    LLMOutput,
+    RolloutFromModel,
+)
 
 __all__ = [
     "AdaptiveKLController",
 
@@ -543,8 +543,10 @@ def step_scheduler(self):
             while len(self._kl_queue):
                 self._kl_queue.remove(self._kl_queue[0])
 
+
 LLMInpOut = TypeVar("LLMInpOut")
 
+
 class LLMInput(TensorClass["nocast"]):
     """Represents the input to a Large Language Model (LLM).
 
@@ -557,11 +559,13 @@ class LLMInput(TensorClass["nocast"]):
     .. seealso:: :class:`~torchrl.data.LLMOutput` and :class:`~torchrl.data.LLMData`.
 
     """
+
     tokens: torch.Tensor
     attention_mask: torch.Tensor | None = None
     token_list: list[int] | list[list[int]] | None = None
     text: str | list[str] | None = None
 
+
 class LLMOutput(TensorClass["nocast"]):
     """Represents the output from a Large Language Model (LLM).
 
@@ -581,6 +585,7 @@ class LLMOutput(TensorClass["nocast"]):
     .. seealso:: :class:`~torchrl.data.LLMInput` and :class:`~torchrl.data.LLMData`.
 
     """
+
     tokens: torch.Tensor
     tokens_response: torch.Tensor | None = None
     token_list: list[int] | list[list[int]] | None = None
@@ -594,6 +599,7 @@ def from_vllm_output(cls: type[LLMInpOut], vllm_output) -> LLMInpOut:
         # placeholder
         raise NotImplementedError
 
+
 class LLMData(TensorClass["nocast"]):
     """Represents the input or output of a Large Language Model (LLM).
 
@@ -619,11 +625,13 @@ class LLMData(TensorClass["nocast"]):
     .. seealso:: :class:`~torchrl.data.LLMInput` and :class:`~torchrl.data.LLMOutput`.
 
     """
-    tokens: torch.Tensor
+
+    tokens: torch.Tensor | None = None
     tokens_response: torch.Tensor | None = None
     attention_mask: torch.Tensor | None = None
     token_list: list[int] | list[list[int]] | None = None
     tokens_response_list: list[list[int]] | None = None
     logits: torch.Tensor | None = None
     log_probs: torch.Tensor | None = None
     text: str | list[str] | None = None
+    text_response: str | list[str] | None = None  # response as text, typed like ``text``
@@ -42,13 +42,13 @@ class LLMEnv(EnvBase):
 
     Keyword Args:
         token_key (NestedKey, optional): The key in the tensordict where the tokens are stored (when `str2str=False`).
-            Defaults to ``("tokens_in", "input_ids")``.
+            Defaults to ``"tokens"``.
         str_key (NestedKey, optional): The key in the tensordict where the string input is stored (when `str2str=True`).
-            Defaults to ``"test"``.
+            Defaults to ``"text"``.
         attention_key (NestedKey, optional): The key in the tensordict where the attention mask is stored.
-            Defaults to ``("tokens_in", "input_ids")``
+            Defaults to ``"attention_mask"``.
         action_key (NestedKey, optional): The key in the tensordict where the action is stored. Defaults to
-            ``("tokens_out", "sequences")``.
+            ``"tokens_response"`` (when `str2str=False`) or ``"text_response"`` (when `str2str=True`).
         reward_key (NestedKey, optional): The key in the tensordict where the reward is stored if `assign_reward=True`.
             Defaults to  ``"reward"``.
         str2str (bool, optional): Whether the environment should expect strings as input and output. Defaults to ``False``.
@@ -71,6 +71,8 @@ class LLMEnv(EnvBase):
         batch_size (int or torch.Size, optional): Batch size of the environment. If left empty, the environment
             is batchless (or batch-unlocked), meaning that it can accept tensordicts of any batch size.
             Defaults to ``None`` (batch-unlocked).
+        as_llm_data (bool, optional): If ``True``, the data will be of type :class:`~torchrl.data.LLMData`.
+            Defaults to ``False``.
 
     .. seealso:: :class:`~torchrl.envs.DataLoadingPrimer` for examples.
 
@@ -79,10 +81,11 @@ class LLMEnv(EnvBase):
 
     """
 
-    _DEFAULT_TOKEN_KEY = ("tokens_in", "input_ids")
+    _DEFAULT_TOKEN_KEY = "tokens"
     _DEFAULT_STR_KEY = "text"
-    _DEFAULT_ATTENTION_KEY = ("tokens_in", "attention_mask")
-    _DEFAULT_ACTION_KEY = ("tokens_out", "sequences")
+    _DEFAULT_ATTENTION_KEY = "attention_mask"
+    _DEFAULT_ACTION_TOKENS_KEY = "tokens_response"
+    _DEFAULT_ACTION_STR_KEY = "text_response"
 
     def __init__(
         self,
@@ -100,15 +103,20 @@ def __init__(
         assign_done: bool = False,
         batch_size: int | torch.Size | None = None,
         has_attention: bool = True,
+        as_llm_data: bool = False,
     ) -> None:
+        self.as_llm_data = as_llm_data
         if token_key is None:
             token_key = self._DEFAULT_TOKEN_KEY
         if str_key is None:
             str_key = self._DEFAULT_STR_KEY
         if attention_key is None:
             attention_key = self._DEFAULT_ATTENTION_KEY
         if action_key is None:
-            action_key = self._DEFAULT_ACTION_KEY
+            if str2str:
+                action_key = self._DEFAULT_ACTION_STR_KEY
+            else:
+                action_key = self._DEFAULT_ACTION_TOKENS_KEY
         if batch_size is None:
             self._batch_locked = False
             batch_size = ()
@@ -206,7 +214,7 @@ def __init__(
         else:
             # Use single done
             self.full_done_spec_unbatched = Composite(
-                tokens=Composite(
+                tokens_data=Composite(
                     done=Unbounded(shape=(-1,), dtype=torch.bool),
                     terminated=Unbounded(shape=(-1,), dtype=torch.bool),
                 ),
@@ -228,6 +236,7 @@ def from_dataloader(
         device: torch.device | None = None,
         vocab_size: int | None = None,
         no_stack: bool = False,
+        as_llm_data: bool = False,
         batch_size: int | torch.Size | None = None,
         has_attention: bool = True,
         assign_reward: bool = False,
@@ -288,6 +297,8 @@ def from_dataloader(
             repeats (int, optional): How many times the same sample needs to appear successively. This can be useful in
                 situations like GRPO where a single prompt is used multiple times to estimate the advantage using Monte-Carlo
                 samples (rather than an advantage module).
+            as_llm_data (bool, optional): If ``True``, the data will be of type :class:`~torchrl.data.LLMData`.
+                Defaults to ``False``.
 
         Returns:
             LLMEnv: The created LLMEnv instance.
@@ -334,6 +345,7 @@ def from_dataloader(
             assign_done=assign_done,
             batch_size=batch_size,
             has_attention=has_attention,
+            as_llm_data=as_llm_data,
         )
         return env.append_transform(primer)
 
@@ -353,6 +365,8 @@ def _step(
         self._make_next_obs(tensordict, next_td)
         self._maybe_make_reward(tensordict, next_td)
         self._maybe_make_done(tensordict, next_td)
+        if self.as_llm_data:
+            raise NotImplementedError()
         return next_td
 
     def _maybe_make_reward(
@@ -378,14 +392,14 @@ def _maybe_make_done(
                 )
             else:
                 done = torch.zeros_like(action, dtype=torch.bool)
-            next_td.set(("tokens", "terminated"), done)
-            next_td.set(("tokens", "done"), done.clone())
+            next_td.set(("tokens_data", "terminated"), done)
+            next_td.set(("tokens_data", "done"), done.clone())
             next_td.set(
-                "terminated", next_td.get(("tokens", "done")).any(-1, keepdim=True)
+                "done", next_td.get(("tokens_data", "done")).any(-1, keepdim=True)  # root "done": any over last dim
             )
             next_td.set(
                 "terminated",
-                next_td.get(("tokens", "terminated")).any(-1, keepdim=True),
+                next_td.get(("tokens_data", "terminated")).any(-1, keepdim=True),
             )
         return next_td
 
@@ -400,7 +414,8 @@ def _make_next_obs(
             if self.has_attention:
                 attention_mask = tensordict.get(self.attention_key)
                 n = action.shape[-1] - attention_mask.shape[-1]
-                if n:
+                if n > 0:
+                    # Pad only if the action is longer than the mask (n <= 0 can happen, e.g. with rand_action)
                     attention_mask = torch.cat(
                         [
                             attention_mask,
@@ -471,7 +486,10 @@ def check_str():
                 f"torchrl.envs.DataLoadingPrimer) is appended to the env transforms."
             )
         td_reset = tensordict.copy()
-        return self._maybe_make_done(tensordict, td_reset)
+        tensordict = self._maybe_make_done(tensordict, td_reset)
+        if self.as_llm_data:
+            raise NotImplementedError()
+        return tensordict
 
     def _set_seed(self, seed: int | None):
         return seed
 
@@ -417,8 +417,6 @@ def __init__(
         else:
             self.data_keys = list(primers.keys(True, True))
 
-        self._reset_key = "_reset"
-
         super().__init__(
             primers=primers,
             default_value=self._load_from_dataloader,
@@ -427,6 +425,7 @@ def __init__(
             single_default_value=True,
             call_before_env_reset=True,
         )
+        self._reset_key = "_reset"
 
     @classmethod
     def _endless_iter(self, obj):