Description
Reminder
- I have read the above rules and searched the existing issues.
System Info
root@715dd9ae772a:/app# export CUDA_VISIBLE_DEVICES=0,1
root@715dd9ae772a:/app# export GRADIO_SHARE=1
root@715dd9ae772a:/app# export USE_MODELSCOPE_HUB=1
root@715dd9ae772a:/app# llamafactory-cli webui
/opt/conda/lib/python3.11/site-packages/jieba/_compat.py:18: UserWarning: pkg_resources is deprecated as an API. See https://setuptools.pypa.io/en/latest/pkg_resources.html. The pkg_resources package is slated for removal as early as 2025-11-30. Refrain from using this package or pin to Setuptools<81.
import pkg_resources
Visit http://ip:port for Web UI, e.g., http://127.0.0.1:7860
* Running on local URL: http://0.0.0.0:7860
Could not create share link. Please check your internet connection or our status page: https://status.gradio.app.
[INFO|2025-12-04 00:28:27] llamafactory.launcher:143 >> Initializing 2 distributed tasks at: 127.0.0.1:58015
[INFO|2025-12-04 00:28:34] llamafactory.hparams.parser:143 >> Set `ddp_find_unused_parameters` to False in DDP training since LoRA is enabled.
[INFO|2025-12-04 00:28:34] llamafactory.hparams.parser:468 >> Process rank: 0, world size: 2, device: cuda:0, distributed training: True, compute dtype: torch.bfloat16
[INFO|2025-12-04 00:28:34] llamafactory.hparams.parser:468 >> Process rank: 1, world size: 2, device: cuda:1, distributed training: True, compute dtype: torch.bfloat16
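
The share-link failure above does not block training; the UI stays reachable on the local URL. A possible workaround (a sketch, assuming the standard Gradio environment variables and that port 7860 from the log is free; user@remote-host is a placeholder) is to skip the share tunnel and forward the port over SSH instead:

export GRADIO_SHARE=0
export GRADIO_SERVER_NAME=0.0.0.0
export GRADIO_SERVER_PORT=7860
llamafactory-cli webui
# on the local machine, forward the remote port (placeholder host):
ssh -L 7860:localhost:7860 user@remote-host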
Configuration file:
llamafactory-cli train \
    --stage sft \
    --do_train True \
    --model_name_or_path LLM-Research/Meta-Llama-3-8B-Instruct \
    --preprocessing_num_workers 16 \
    --finetuning_type lora \
    --template llama3 \
    --flash_attn auto \
    --dataset_dir data \
    --dataset alpaca_zh_smart_mine \
    --cutoff_len 2048 \
    --learning_rate 5e-05 \
    --num_train_epochs 3.0 \
    --max_samples 100000 \
    --per_device_train_batch_size 2 \
    --gradient_accumulation_steps 8 \
    --lr_scheduler_type cosine \
    --max_grad_norm 1.0 \
    --logging_steps 5 \
    --save_steps 100 \
    --warmup_steps 0 \
    --packing False \
    --enable_thinking True \
    --report_to none \
    --output_dir saves/Llama-3-8B-Instruct/lora/train_2025-12-04-00-31-23 \
    --bf16 True \
    --plot_loss True \
    --trust_remote_code True \
    --ddp_timeout 180000000 \
    --include_num_input_tokens_seen True \
    --optim adamw_torch \
    --lora_rank 8 \
    --lora_alpha 16 \
    --lora_dropout 0 \
    --lora_target all
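
For reference, the same run can usually be expressed as a YAML file and launched with a single argument (a sketch, assuming the YAML keys mirror the CLI flag names as in the LLaMA-Factory example configs; the file name llama3_lora_sft.yaml is arbitrary):

cat > llama3_lora_sft.yaml <<'EOF'
stage: sft
do_train: true
model_name_or_path: LLM-Research/Meta-Llama-3-8B-Instruct
finetuning_type: lora
lora_rank: 8
lora_alpha: 16
lora_dropout: 0
lora_target: all
template: llama3
dataset_dir: data
dataset: alpaca_zh_smart_mine
cutoff_len: 2048
learning_rate: 5.0e-5
num_train_epochs: 3.0
per_device_train_batch_size: 2
gradient_accumulation_steps: 8
lr_scheduler_type: cosine
bf16: true
output_dir: saves/Llama-3-8B-Instruct/lora/train_2025-12-04-00-31-23
EOF
llamafactory-cli train llama3_lora_sft.yaml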
Reproduction
root@715dd9ae772a:/app# export CUDA_VISIBLE_DEVICES=0,1
root@715dd9ae772a:/app# export GRADIO_SHARE=1
root@715dd9ae772a:/app# export USE_MODELSCOPE_HUB=1
root@715dd9ae772a:/app# llamafactory-cli webui
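
The startup log is the same as under System Info above. To reproduce the same two-GPU DDP run without going through the Web UI, training can also be launched directly (a sketch, assuming the FORCE_TORCHRUN switch from the LLaMA-Factory docs and reusing the hypothetical YAML file sketched above):

FORCE_TORCHRUN=1 CUDA_VISIBLE_DEVICES=0,1 llamafactory-cli train llama3_lora_sft.yaml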
Others