Commit c1f953f

Small bug fixes for running models without tokenizers (#168)
1 parent 2a6fff4 commit c1f953f

File tree: 2 files changed (+8, -5 lines)

diffusion/callbacks/log_diffusion_images.py

Lines changed: 2 additions & 2 deletions
@@ -98,7 +98,7 @@ def __init__(self,
         latent_batch = {}
         tokenized_t5 = t5_tokenizer(batch,
                                     padding='max_length',
-                                    max_length=t5_tokenizer.model.max_length,
+                                    max_length=t5_tokenizer.model_max_length,
                                     truncation=True,
                                     return_tensors='pt')
         t5_attention_mask = tokenized_t5['attention_mask'].to(torch.bool).cuda()
@@ -108,7 +108,7 @@ def __init__(self,

         tokenized_clip = clip_tokenizer(batch,
                                         padding='max_length',
-                                        max_length=t5_tokenizer.model.max_length,
+                                        max_length=clip_tokenizer.model_max_length,
                                         truncation=True,
                                         return_tensors='pt')
         clip_attention_mask = tokenized_clip['attention_mask'].cuda()
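
For context on these two hunks: Hugging Face tokenizers expose their maximum sequence length as a model_max_length attribute directly on the tokenizer object, so the old spelling t5_tokenizer.model.max_length raised an AttributeError; the CLIP call was additionally truncating against the T5 tokenizer's limit instead of its own. A minimal sketch of the corrected usage follows; the checkpoint names and prompt are illustrative, not taken from this repo's configs:

from transformers import AutoTokenizer

# Illustrative checkpoints -- the repo's configs pick the actual models.
t5_tokenizer = AutoTokenizer.from_pretrained('google/t5-v1_1-base')
clip_tokenizer = AutoTokenizer.from_pretrained('openai/clip-vit-large-patch14')

batch = ['a photo of an astronaut riding a horse']

# model_max_length lives on the tokenizer itself (e.g. 512 for T5, 77 for
# CLIP), so each tokenizer must pad/truncate to its own limit.
tokenized_t5 = t5_tokenizer(batch,
                            padding='max_length',
                            max_length=t5_tokenizer.model_max_length,
                            truncation=True,
                            return_tensors='pt')
tokenized_clip = clip_tokenizer(batch,
                                padding='max_length',
                                max_length=clip_tokenizer.model_max_length,
                                truncation=True,
                                return_tensors='pt')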

diffusion/train.py

Lines changed: 6 additions & 3 deletions
@@ -88,17 +88,20 @@ def train(config: DictConfig) -> None:

     model: ComposerModel = hydra.utils.instantiate(config.model)

+    # If the model has a tokenizer, we'll need it for the dataset
+    if hasattr(model, 'tokenizer'):
+        tokenizer = model.tokenizer
+    else:
+        tokenizer = None
+
     if hasattr(model, 'autoencoder_loss'):
         # Check if this is training an autoencoder. If so, the optimizer needs different param groups
         optimizer = make_autoencoder_optimizer(config, model)
-        tokenizer = None
     elif isinstance(model, ComposerTextToImageMMDiT):
         # Check if this is training a transformer. If so, the optimizer needs different param groups
         optimizer = make_transformer_optimizer(config, model)
-        tokenizer = model.tokenizer
     else:
         optimizer = hydra.utils.instantiate(config.optimizer, params=model.parameters())
-        tokenizer = model.tokenizer

     # Load train dataset. Currently this expects to load according to the datasetHparam method.
     # This means adding external datasets is currently not super easy. Will refactor or check for
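
This hunk hoists tokenizer resolution out of the optimizer branches, so a model without a tokenizer attribute no longer crashes with an AttributeError regardless of which optimizer path it takes. The same fallback can also be expressed with getattr's default argument; a small self-contained sketch with hypothetical stand-in model classes, not code from this repo:

class Autoencoder:
    """Hypothetical stand-in for a model that has no tokenizer."""

class TextToImageModel:
    """Hypothetical stand-in for a model that carries its tokenizer."""

    def __init__(self, tokenizer):
        self.tokenizer = tokenizer

def resolve_tokenizer(model):
    # Behaves like the new hasattr check: fall back to None when the
    # model does not carry a tokenizer.
    return getattr(model, 'tokenizer', None)

assert resolve_tokenizer(Autoencoder()) is None
assert resolve_tokenizer(TextToImageModel('clip-tokenizer')) == 'clip-tokenizer'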
