-
-
Notifications
You must be signed in to change notification settings - Fork 4.2k
Open
Labels
Description
Hi all! I'm having issues with finetuning Devstral2 after updating unsloth and transformers packages. GPT_OSS also returns similar error. Any help would be appreciated!
unsloth = 2025.12.9
unsloth_zoo = 2025.12.7
transformers = 5.0.0.dev0
torch = 2.9.1
model, tokenizer = FastLanguageModel.from_pretrained(
model_name = MODEL_DIR,
max_seq_length = MAX_SEQ_LENGTH,
full_finetuning = False,
local_files_only = True,
load_in_4bit = False,
device_map = "balanced",
max_memory = {0:"78GiB", 1:"78GiB", 2:"78GiB", 3:"78GiB", "cpu":"256GiB"},
offload_folder = offload_dir,
trust_remote_code = True,
config = config,
)
model = FastLanguageModel.get_peft_model(
model,
r = 4,
finetune_language_layers = True,
finetune_attention_modules = True,
finetune_mlp_modules = True,
lora_alpha = 8,
lora_dropout = 0.0,
bias = "none",
use_gradient_checkpointing = "unsloth",
random_state = 3407,
use_rslora = False,
loftq_config = None,
)

trainer = SFTTrainer(
model = model,
tokenizer = tokenizer,
train_dataset = dataset,
eval_dataset = None,
args = SFTConfig(
dataset_text_field = "text",
per_device_train_batch_size = 2,
gradient_accumulation_steps = 2,
warmup_steps = 5,
max_steps = 15,
learning_rate = 2e-4,
logging_steps = 1,
optim = "adamw_8bit",
weight_decay = 0.001,
lr_scheduler_type = "linear",
seed = 3407,
report_to = "none",
),
)

Full log:
File "/root/oss_aws/devstral_test.py", line 273, in <module>
_ = trainer.model(input_ids=x, attention_mask=torch.ones_like(x))
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/uns/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/uns/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/uns/lib/python3.12/site-packages/peft/peft_model.py", line 1850, in forward
return self.base_model(
^^^^^^^^^^^^^^^^
File "/root/uns/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/uns/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/uns/lib/python3.12/site-packages/peft/tuners/tuners_utils.py", line 222, in forward
return self.model.forward(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/uns/lib/python3.12/site-packages/accelerate/hooks.py", line 175, in new_forward
output = module._old_forward(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/transformers/src/transformers/utils/generic.py", line 810, in wrapper
output = func(self, *args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/transformers/src/transformers/models/ministral3/modeling_ministral3.py", line 473, in forward
outputs: BaseModelOutputWithPast = self.model(
^^^^^^^^^^^
File "/root/uns/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/uns/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/transformers/src/transformers/utils/generic.py", line 965, in wrapper
outputs = func(self, *args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/transformers/src/transformers/models/ministral3/modeling_ministral3.py", line 409, in forward
hidden_states = decoder_layer(
^^^^^^^^^^^^^^
File "/root/transformers/src/transformers/modeling_layers.py", line 94, in __call__
return super().__call__(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/uns/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/uns/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/uns/lib/python3.12/site-packages/accelerate/hooks.py", line 175, in new_forward
output = module._old_forward(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/transformers/src/transformers/models/ministral3/modeling_ministral3.py", line 242, in forward
hidden_states, _ = self.self_attn(
^^^^^^^^^^^^^^^
File "/root/uns/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/uns/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/uns/lib/python3.12/site-packages/accelerate/hooks.py", line 175, in new_forward
output = module._old_forward(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/transformers/src/transformers/models/ministral3/modeling_ministral3.py", line 144, in forward
query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/uns/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/uns/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/uns/lib/python3.12/site-packages/peft/tuners/lora/layer.py", line 757, in forward
result = self.base_layer(x, *args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/uns/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1775, in _wrapped_call_impl
return self._call_impl(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/uns/lib/python3.12/site-packages/torch/nn/modules/module.py", line 1786, in _call_impl
return forward_call(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/uns/lib/python3.12/site-packages/accelerate/hooks.py", line 175, in new_forward
output = module._old_forward(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/uns/lib/python3.12/site-packages/unsloth/kernels/fp8.py", line 590, in patched_forward
return forward_function(X, self.weight, getattr(self, scale_attr))
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/uns/lib/python3.12/site-packages/unsloth/kernels/fp8.py", line 392, in fp8_torch_block_quant_forward
return FP8BlockQuantLinear.apply(X, weight, weight_scale)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/uns/lib/python3.12/site-packages/torch/autograd/function.py", line 581, in apply
return super().apply(*args, **kwargs)  # type: ignore[misc]
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/root/uns/lib/python3.12/site-packages/unsloth/kernels/fp8.py", line 347, in forward
p, q = weight_scale.shape
^^^^
ValueError: not enough values to unpack (expected 2, got 0)