Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions fast_llm/engine/checkpoint/huggingface.py
Original file line number Diff line number Diff line change
Expand Up @@ -139,6 +139,7 @@ def _export_config(cls, config: FastLLMModelConfig) -> dict[str, typing.Any]:
"bos_token_id",
"decoder_start_token_id",
"eos_token_id",
"mask_token_id",
"pad_token_id",
"sep_token_id",
# Initialization / pretraining metadata Fast-LLM does not consume.
Expand Down
2 changes: 2 additions & 0 deletions fast_llm/models/gpt/conversion/llama.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,7 @@ def _llama_rotary_export(config: AttentionConfig) -> dict:
rope_parameters.update(
{
"rope_type": "yarn",
"factor": rotary.scale_factor,
"attention_factor": rotary.attention_factor,
"beta_fast": rotary.beta_fast,
"beta_slow": rotary.beta_slow,
Expand Down Expand Up @@ -132,6 +133,7 @@ def _llama_rotary_import(hf_dict: dict) -> dict:
elif rope_type == "yarn":
rotary_config.update(
{
"scale_factor": rope_params["factor"],
"attention_factor": rope_params["attention_factor"],
"beta_fast": rope_params["beta_fast"],
"beta_slow": rope_params["beta_slow"],
Expand Down
15 changes: 11 additions & 4 deletions tests/utils/model_configs.py
Original file line number Diff line number Diff line change
Expand Up @@ -454,8 +454,12 @@ def update_and_add_testing_config(
# Megatron doesn't support Yarn-style Rotary Embeddings
megatron_args=None,
checkpoint_format=DiffusionLlamaCheckpointFormat,
# TODO: Conversion is broken.
# TODO: Add back generate as `normal` when stable.
# Config + weight conversion works (test_conversion passes). The convert group stays `broken`
# because test_huggingface_model fails: the custom modeling `from_pretrained` requires a
# generation_config.json that Fast-LLM does not export (unlike `dream`, which ships one). Behind
# that, the forward pass likely diverges as it does for `dream` — DiffusionLlama is a bidirectional
# diffusion LM, while Fast-LLM runs it causally — but that is unverified since loading fails first.
# Neither is a converter bug. `generate` is broken for the same diffusion-decoding reason.
groups={
ModelTestingGroup.basic: ModelTestingGroupAction.unimportant,
ModelTestingGroup.checkpoint: ModelTestingGroupAction.normal,
Expand Down Expand Up @@ -530,8 +534,11 @@ def update_and_add_testing_config(
# Megatron doesn't support per sub layer biases.
megatron_args=None,
checkpoint_format=DiffusionDreamCheckpointFormat,
# TODO: Conversion is broken.
# TODO: Add back generate as `normal` when stable.
# Config + weight conversion works (test_conversion passes). The convert group stays `broken`
# because test_huggingface_model fails: Dream is a bidirectional diffusion LM, so the HF forward pass
# diverges from Fast-LLM's causal run (confirmed: structurally different logits/hidden states, not
# a tolerance miss). Matching it needs bidirectional-attention modeling, not a converter change.
# `generate` is broken for the same diffusion-decoding reason.
groups={
ModelTestingGroup.basic: ModelTestingGroupAction.unimportant,
ModelTestingGroup.checkpoint: ModelTestingGroupAction.broken,
Expand Down
Loading