Skip to content

[Bug] Unable to train Devstral2 #3862

@Klepackp

Description

@Klepackp

Hi all! I'm having issues with finetuning Devstral2 after updating the unsloth and transformers packages. GPT_OSS also returns a similar error. Any help would be appreciated!

unsloth = 2025.12.9
unsloth_zoo = 2025.12.7
transformers = 5.0.0.dev0
torch = 2.9.1

model, tokenizer = FastLanguageModel.from_pretrained(                                                 
    model_name        = MODEL_DIR,                                                                                                                                           
    max_seq_length    = MAX_SEQ_LENGTH,                                                               
    full_finetuning   = False,                                                                 
    local_files_only  = True,                                                                         
    load_in_4bit      = False,                                                             
    device_map        = "balanced",        
    max_memory        = {0:"78GiB", 1:"78GiB", 2:"78GiB", 3:"78GiB", "cpu":"256GiB"},
    offload_folder    = offload_dir,
    trust_remote_code = True,
    config            = config,
)
model = FastLanguageModel.get_peft_model(
    model,
    r = 4,
    finetune_language_layers   = True, 
    finetune_attention_modules = True, 
    finetune_mlp_modules       = True, 
    lora_alpha = 8,
    lora_dropout = 0.0,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
    use_rslora = False,
    loftq_config = None,
)
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    eval_dataset = None, 
    args = SFTConfig(
        dataset_text_field = "text",
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 2, 
        warmup_steps = 5,
        max_steps = 15,
        learning_rate = 2e-4, 
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.001,
        lr_scheduler_type = "linear",
        seed = 3407,
        report_to = "none", 
    ),
)

Full log:


  File "/root/oss_aws/devstral_test.py", line 273, in <module>
    _ = trainer.model(input_ids=x, attention_mask=torch.ones_like(x))
        ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/uns/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/uns/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/uns/lib/python3.12/site-packages/peft/peft_model.py", line 1850, in forward
    return self.base_model(
           ^^^^^^^^^^^^^^^^
  File "/root/uns/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/uns/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/uns/lib/python3.12/site-packages/peft/tuners/tuners_utils.py", line 222, in forward
    return self.model.forward(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/uns/lib/python3.12/site-packages/accelerate/hooks.py", line 175, in new_forward
    output = module._old_forward(*args, **kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/transformers/src/transformers/utils/generic.py", line 810, in wrapper
    output = func(self, *args, **kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/transformers/src/transformers/models/ministral3/modeling_ministral3.py", line 473, in forward
    outputs: BaseModelOutputWithPast = self.model(
                                       ^^^^^^^^^^^
  File "/root/uns/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/uns/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/transformers/src/transformers/utils/generic.py", line 965, in wrapper
    outputs = func(self, *args, **kwargs)
              ^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/transformers/src/transformers/models/ministral3/modeling_ministral3.py", line 409, in forward
    hidden_states = decoder_layer(
                    ^^^^^^^^^^^^^^
  File "/root/transformers/src/transformers/modeling_layers.py", line 94, in __call__
    return super().__call__(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/uns/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/uns/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/uns/lib/python3.12/site-packages/accelerate/hooks.py", line 175, in new_forward
    output = module._old_forward(*args, **kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/transformers/src/transformers/models/ministral3/modeling_ministral3.py", line 242, in forward
    hidden_states, _ = self.self_attn(
                       ^^^^^^^^^^^^^^^
  File "/root/uns/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/uns/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/uns/lib/python3.12/site-packages/accelerate/hooks.py", line 175, in new_forward
    output = module._old_forward(*args, **kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/transformers/src/transformers/models/ministral3/modeling_ministral3.py", line 144, in forward
    query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/uns/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/uns/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/uns/lib/python3.12/site-packages/peft/tuners/lora/layer.py", line 757, in forward
    result = self.base_layer(x, *args, **kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/uns/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl
    return self._call_impl(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/uns/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl
    return forward_call(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/uns/lib/python3.12/site-packages/accelerate/hooks.py", line 175, in new_forward
    output = module._old_forward(*args, **kwargs)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/uns/lib/python3.12/site-packages/unsloth/kernels/fp8.py", line 590, in patched_forward
    return forward_function(X, self.weight, getattr(self, scale_attr))
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/uns/lib/python3.12/site-packages/unsloth/kernels/fp8.py", line 392, in fp8_torch_block_quant_forward
    return FP8BlockQuantLinear.apply(X, weight, weight_scale)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/uns/lib/python3.12/site-packages/torch/autograd/function.py", line 581, in apply
    return super().apply(*args, **kwargs)  # type: ignore[misc]
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/root/uns/lib/python3.12/site-packages/unsloth/kernels/fp8.py", line 347, in forward
    p, q = weight_scale.shape
    ^^^^
ValueError: not enough values to unpack (expected 2, got 0)

Metadata

Metadata

Assignees

Type

No type

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions