From 02844cec63fcd8c3d4c901693da4f55f89d93a71 Mon Sep 17 00:00:00 2001 From: Yoav HaCohen Date: Wed, 16 Jul 2025 17:47:04 +0300 Subject: [PATCH] LTXV-0.9.8: Long shot generation --- README.md | 37 ++++++++++----- ...v-fp8.yaml => ltxv-13b-0.9.8-dev-fp8.yaml} | 4 +- ...0.9.7-dev.yaml => ltxv-13b-0.9.8-dev.yaml} | 4 +- configs/ltxv-13b-0.9.8-distilled-fp8.yaml | 29 ++++++++++++ configs/ltxv-13b-0.9.8-distilled.yaml | 29 ++++++++++++ ....yaml => ltxv-2b-0.9.8-distilled-fp8.yaml} | 4 +- ...lled.yaml => ltxv-2b-0.9.8-distilled.yaml} | 4 +- ltx_video/pipelines/pipeline_ltx_video.py | 45 +++++++++++++++++++ 8 files changed, 137 insertions(+), 19 deletions(-) rename configs/{ltxv-13b-0.9.7-dev-fp8.yaml => ltxv-13b-0.9.8-dev-fp8.yaml} (91%) rename configs/{ltxv-13b-0.9.7-dev.yaml => ltxv-13b-0.9.8-dev.yaml} (91%) create mode 100644 configs/ltxv-13b-0.9.8-distilled-fp8.yaml create mode 100644 configs/ltxv-13b-0.9.8-distilled.yaml rename configs/{ltxv-13b-0.9.7-distilled-fp8.yaml => ltxv-2b-0.9.8-distilled-fp8.yaml} (88%) rename configs/{ltxv-13b-0.9.7-distilled.yaml => ltxv-2b-0.9.8-distilled.yaml} (88%) diff --git a/README.md b/README.md index c8ce023..72928dd 100644 --- a/README.md +++ b/README.md @@ -59,6 +59,20 @@ The model supports image-to-video, keyframe-based animation, video extension (bo # News +## July, 16th, 2025: New Distilled models v0.9.8 with up to 60 seconds of video: +- Long shot generation in LTXV-13B! + * LTX-Video now supports up to 60 seconds of video. + * Compatible also with the official IC-LoRAs. + * Try now in [ComfyUI](https://github.com/Lightricks/ComfyUI-LTXVideo/tree/master/example_workflows/ltxv-13b-i2v-long-multi-prompt.json). 
+- Released new distilled models:
+  * 13B distilled model [ltxv-13b-0.9.8-distilled](https://github.com/Lightricks/LTX-Video/blob/main/configs/ltxv-13b-0.9.8-distilled.yaml)
+  * 2B distilled model [ltxv-2b-0.9.8-distilled](https://github.com/Lightricks/LTX-Video/blob/main/configs/ltxv-2b-0.9.8-distilled.yaml)
+  * Both models are distilled from the same base model [ltxv-13b-0.9.8-dev](https://github.com/Lightricks/LTX-Video/blob/main/configs/ltxv-13b-0.9.8-dev.yaml) and are compatible for use together in the same multiscale pipeline.
+  * Improved prompt understanding and detail generation.
+  * Includes corresponding FP8 weights and workflows.
+- Released a new detailer model [LTX-Video-ICLoRA-detailer-13B-0.9.8](https://huggingface.co/Lightricks/LTX-Video-ICLoRA-detailer-13b-0.9.8)
+  * Available in [ComfyUI](https://github.com/Lightricks/ComfyUI-LTXVideo/tree/master/example_workflows/ltxv-13b-upscale.json).
+
 ## July, 8th, 2025: New Control Models Released!
 - Released three new control models for LTX-Video on HuggingFace:
   * **Depth Control**: [LTX-Video-ICLoRA-depth-13b-0.9.7](https://huggingface.co/Lightricks/LTX-Video-ICLoRA-depth-13b-0.9.7)
@@ -137,12 +151,13 @@ The model supports image-to-video, keyframe-based animation, video extension (bo
 
 | Name | Notes | inference.py config | ComfyUI workflow (Recommended) |
 |-------------------------|--------------------------------------------------------------------------------------------|----------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------|
-| ltxv-13b-0.9.7-dev | Highest quality, requires more VRAM | [ltxv-13b-0.9.7-dev.yaml](https://github.com/Lightricks/LTX-Video/blob/main/configs/ltxv-13b-0.9.7-dev.yaml) | [ltxv-13b-i2v-base.json](https://github.com/Lightricks/ComfyUI-LTXVideo/blob/master/example_workflows/ltxv-13b-i2v-base.json) |
-| 
[ltxv-13b-0.9.7-mix](https://app.ltx.studio/motion-workspace?videoModel=ltxv-13b) | Mix ltxv-13b-dev and ltxv-13b-distilled in the same multi-scale rendering workflow for balanced speed-quality | N/A | [ltxv-13b-i2v-mixed-multiscale.json](https://github.com/Lightricks/ComfyUI-LTXVideo/blob/master/example_workflows/ltxv-13b-i2v-mixed-multiscale.json) | - [ltxv-13b-0.9.7-distilled](https://app.ltx.studio/motion-workspace?videoModel=ltxv) | Faster, less VRAM usage, slight quality reduction compared to 13b. Ideal for rapid iterations | [ltxv-13b-0.9.7-distilled.yaml](https://github.com/Lightricks/LTX-Video/blob/main/configs/ltxv-13b-0.9.7-dev.yaml) | [ltxv-13b-dist-i2v-base.json](https://github.com/Lightricks/ComfyUI-LTXVideo/blob/master/example_workflows/13b-distilled/ltxv-13b-dist-i2v-base.json) | -| [ltxv-13b-0.9.7-distilled-lora128](https://huggingface.co/Lightricks/LTX-Video/blob/main/ltxv-13b-0.9.7-distilled-lora128.safetensors) | LoRA to make ltxv-13b-dev behave like the distilled model | N/A | N/A | -| ltxv-13b-0.9.7-dev-fp8 | Quantized version of ltxv-13b | [ltxv-13b-0.9.7-dev-fp8.yaml](https://github.com/Lightricks/LTX-Video/blob/main/configs/ltxv-13b-0.9.7-dev-fp8.yaml) | [ltxv-13b-i2v-base-fp8.json](https://github.com/Lightricks/ComfyUI-LTXVideo/blob/master/example_workflows/ltxv-13b-i2v-base-fp8.json) | -| ltxv-13b-0.9.7-distilled-fp8 | Quantized version of ltxv-13b-distilled | [ltxv-13b-0.9.7-distilled-fp8.yaml](https://github.com/Lightricks/LTX-Video/blob/main/configs/ltxv-13b-0.9.7-distilled-fp8.yaml) | [ltxv-13b-dist-i2v-base-fp8.json](https://github.com/Lightricks/ComfyUI-LTXVideo/blob/master/example_workflows/13b-distilled/ltxv-13b-dist-i2v-base-fp8.json) | +| ltxv-13b-0.9.8-dev | Highest quality, requires more VRAM | [ltxv-13b-0.9.8-dev.yaml](https://github.com/Lightricks/LTX-Video/blob/main/configs/ltxv-13b-0.9.8-dev.yaml) | [ltxv-13b-i2v-base.json](https://github.com/Lightricks/ComfyUI-LTXVideo/blob/master/example_workflows/ltxv-13b-i2v-base.json) 
| +| [ltxv-13b-0.9.8-mix](https://app.ltx.studio/motion-workspace?videoModel=ltxv-13b) | Mix ltxv-13b-dev and ltxv-13b-distilled in the same multi-scale rendering workflow for balanced speed-quality | N/A | [ltxv-13b-i2v-mixed-multiscale.json](https://github.com/Lightricks/ComfyUI-LTXVideo/blob/master/example_workflows/ltxv-13b-i2v-mixed-multiscale.json) |
+| [ltxv-13b-0.9.8-distilled](https://app.ltx.studio/motion-workspace?videoModel=ltxv) | Faster, less VRAM usage, slight quality reduction compared to 13b. Ideal for rapid iterations | [ltxv-13b-0.9.8-distilled.yaml](https://github.com/Lightricks/LTX-Video/blob/main/configs/ltxv-13b-0.9.8-distilled.yaml) | [ltxv-13b-dist-i2v-base.json](https://github.com/Lightricks/ComfyUI-LTXVideo/blob/master/example_workflows/13b-distilled/ltxv-13b-dist-i2v-base.json) |
+| ltxv-2b-0.9.8-distilled | Smaller model, slight quality reduction compared to 13b distilled. Ideal for fast generation with light VRAM usage | [ltxv-2b-0.9.8-distilled.yaml](https://github.com/Lightricks/LTX-Video/blob/main/configs/ltxv-2b-0.9.8-distilled.yaml) | N/A |
+| ltxv-13b-0.9.8-dev-fp8 | Quantized version of ltxv-13b | [ltxv-13b-0.9.8-dev-fp8.yaml](https://github.com/Lightricks/LTX-Video/blob/main/configs/ltxv-13b-0.9.8-dev-fp8.yaml) | [ltxv-13b-i2v-base-fp8.json](https://github.com/Lightricks/ComfyUI-LTXVideo/blob/master/example_workflows/ltxv-13b-i2v-base-fp8.json) |
+| ltxv-13b-0.9.8-distilled-fp8 | Quantized version of ltxv-13b-distilled | [ltxv-13b-0.9.8-distilled-fp8.yaml](https://github.com/Lightricks/LTX-Video/blob/main/configs/ltxv-13b-0.9.8-distilled-fp8.yaml) | [ltxv-13b-dist-i2v-base-fp8.json](https://github.com/Lightricks/ComfyUI-LTXVideo/blob/master/example_workflows/13b-distilled/ltxv-13b-dist-i2v-base-fp8.json) |
+| ltxv-2b-0.9.8-distilled-fp8 | Quantized version of ltxv-2b-distilled | [ltxv-2b-0.9.8-distilled-fp8.yaml](https://github.com/Lightricks/LTX-Video/blob/main/configs/ltxv-2b-0.9.8-distilled-fp8.yaml) | N/A |
 | ltxv-2b-0.9.6 | 
Good quality, lower VRAM requirement than ltxv-13b | [ltxv-2b-0.9.6-dev.yaml](https://github.com/Lightricks/LTX-Video/blob/main/configs/ltxv-2b-0.9.6-dev.yaml) | [ltxvideo-i2v.json](https://github.com/Lightricks/ComfyUI-LTXVideo/blob/master/example_workflows/low_level/ltxvideo-i2v.json) | | ltxv-2b-0.9.6-distilled | 15× faster, real-time capable, fewer steps needed, no STG/CFG required | [ltxv-2b-0.9.6-distilled.yaml](https://github.com/Lightricks/LTX-Video/blob/main/configs/ltxv-2b-0.9.6-distilled.yaml) | [ltxvideo-i2v-distilled.json](https://github.com/Lightricks/ComfyUI-LTXVideo/blob/master/example_workflows/low_level/ltxvideo-i2v-distilled.json) | @@ -170,7 +185,7 @@ cd LTX-Video # create env python -m venv env source env/bin/activate -python -m pip install -e \[inference\] +python -m pip install -e .\[inference\] ``` #### FP8 Kernels (optional) @@ -186,7 +201,7 @@ To use our model, please follow the inference code in [inference.py](./inference #### For image-to-video generation: ```bash -python inference.py --prompt "PROMPT" --conditioning_media_paths IMAGE_PATH --conditioning_start_frames 0 --height HEIGHT --width WIDTH --num_frames NUM_FRAMES --seed SEED --pipeline_config configs/ltxv-13b-0.9.7-distilled.yaml +python inference.py --prompt "PROMPT" --conditioning_media_paths IMAGE_PATH --conditioning_start_frames 0 --height HEIGHT --width WIDTH --num_frames NUM_FRAMES --seed SEED --pipeline_config configs/ltxv-13b-0.9.8-distilled.yaml ``` #### Extending a video: @@ -195,7 +210,7 @@ python inference.py --prompt "PROMPT" --conditioning_media_paths IMAGE_PATH --co ```bash -python inference.py --prompt "PROMPT" --conditioning_media_paths VIDEO_PATH --conditioning_start_frames START_FRAME --height HEIGHT --width WIDTH --num_frames NUM_FRAMES --seed SEED --pipeline_config configs/ltxv-13b-0.9.7-distilled.yaml +python inference.py --prompt "PROMPT" --conditioning_media_paths VIDEO_PATH --conditioning_start_frames START_FRAME --height HEIGHT --width WIDTH 
--num_frames NUM_FRAMES --seed SEED --pipeline_config configs/ltxv-13b-0.9.8-distilled.yaml ``` #### For video generation with multiple conditions: @@ -204,7 +219,7 @@ You can now generate a video conditioned on a set of images and/or short video s Simply provide a list of paths to the images or video segments you want to condition on, along with their target frame numbers in the generated video. You can also specify the conditioning strength for each item (default: 1.0). ```bash -python inference.py --prompt "PROMPT" --conditioning_media_paths IMAGE_OR_VIDEO_PATH_1 IMAGE_OR_VIDEO_PATH_2 --conditioning_start_frames TARGET_FRAME_1 TARGET_FRAME_2 --height HEIGHT --width WIDTH --num_frames NUM_FRAMES --seed SEED --pipeline_config configs/ltxv-13b-0.9.7-distilled.yaml +python inference.py --prompt "PROMPT" --conditioning_media_paths IMAGE_OR_VIDEO_PATH_1 IMAGE_OR_VIDEO_PATH_2 --conditioning_start_frames TARGET_FRAME_1 TARGET_FRAME_2 --height HEIGHT --width WIDTH --num_frames NUM_FRAMES --seed SEED --pipeline_config configs/ltxv-13b-0.9.8-distilled.yaml ``` ### Using as a library @@ -214,7 +229,7 @@ from ltx_video.inference import infer, InferenceConfig infer( InferenceConfig( - pipeline_config="configs/ltxv-13b-0.9.7-distilled.yaml", + pipeline_config="configs/ltxv-13b-0.9.8-distilled.yaml", prompt=PROMPT, height=HEIGHT, width=WIDTH, diff --git a/configs/ltxv-13b-0.9.7-dev-fp8.yaml b/configs/ltxv-13b-0.9.8-dev-fp8.yaml similarity index 91% rename from configs/ltxv-13b-0.9.7-dev-fp8.yaml rename to configs/ltxv-13b-0.9.8-dev-fp8.yaml index 66e9c69..76b25f1 100644 --- a/configs/ltxv-13b-0.9.7-dev-fp8.yaml +++ b/configs/ltxv-13b-0.9.8-dev-fp8.yaml @@ -1,7 +1,7 @@ pipeline_type: multi-scale -checkpoint_path: "ltxv-13b-0.9.7-dev-fp8.safetensors" +checkpoint_path: "ltxv-13b-0.9.8-dev-fp8.safetensors" downscale_factor: 0.6666666 -spatial_upscaler_model_path: "ltxv-spatial-upscaler-0.9.7.safetensors" +spatial_upscaler_model_path: "ltxv-spatial-upscaler-0.9.8.safetensors" 
stg_mode: "attention_values" # options: "attention_values", "attention_skip", "residual", "transformer_block" decode_timestep: 0.05 decode_noise_scale: 0.025 diff --git a/configs/ltxv-13b-0.9.7-dev.yaml b/configs/ltxv-13b-0.9.8-dev.yaml similarity index 91% rename from configs/ltxv-13b-0.9.7-dev.yaml rename to configs/ltxv-13b-0.9.8-dev.yaml index ae54825..0c22e9e 100644 --- a/configs/ltxv-13b-0.9.7-dev.yaml +++ b/configs/ltxv-13b-0.9.8-dev.yaml @@ -1,7 +1,7 @@ pipeline_type: multi-scale -checkpoint_path: "ltxv-13b-0.9.7-dev.safetensors" +checkpoint_path: "ltxv-13b-0.9.8-dev.safetensors" downscale_factor: 0.6666666 -spatial_upscaler_model_path: "ltxv-spatial-upscaler-0.9.7.safetensors" +spatial_upscaler_model_path: "ltxv-spatial-upscaler-0.9.8.safetensors" stg_mode: "attention_values" # options: "attention_values", "attention_skip", "residual", "transformer_block" decode_timestep: 0.05 decode_noise_scale: 0.025 diff --git a/configs/ltxv-13b-0.9.8-distilled-fp8.yaml b/configs/ltxv-13b-0.9.8-distilled-fp8.yaml new file mode 100644 index 0000000..444718b --- /dev/null +++ b/configs/ltxv-13b-0.9.8-distilled-fp8.yaml @@ -0,0 +1,29 @@ +pipeline_type: multi-scale +checkpoint_path: "ltxv-13b-0.9.8-distilled-fp8.safetensors" +downscale_factor: 0.6666666 +spatial_upscaler_model_path: "ltxv-spatial-upscaler-0.9.8.safetensors" +stg_mode: "attention_values" # options: "attention_values", "attention_skip", "residual", "transformer_block" +decode_timestep: 0.05 +decode_noise_scale: 0.025 +text_encoder_model_name_or_path: "PixArt-alpha/PixArt-XL-2-1024-MS" +precision: "float8_e4m3fn" # options: "float8_e4m3fn", "bfloat16", "mixed_precision" +sampler: "from_checkpoint" # options: "uniform", "linear-quadratic", "from_checkpoint" +prompt_enhancement_words_threshold: 120 +prompt_enhancer_image_caption_model_name_or_path: "MiaoshouAI/Florence-2-large-PromptGen-v2.0" +prompt_enhancer_llm_model_name_or_path: "unsloth/Llama-3.2-3B-Instruct" +stochastic_sampling: false + +first_pass: + 
timesteps: [1.0000, 0.9937, 0.9875, 0.9812, 0.9750, 0.9094, 0.7250] + guidance_scale: 1 + stg_scale: 0 + rescaling_scale: 1 + skip_block_list: [42] + +second_pass: + timesteps: [0.9094, 0.7250, 0.4219] + guidance_scale: 1 + stg_scale: 0 + rescaling_scale: 1 + skip_block_list: [42] + tone_map_compression_ratio: 0.6 diff --git a/configs/ltxv-13b-0.9.8-distilled.yaml b/configs/ltxv-13b-0.9.8-distilled.yaml new file mode 100644 index 0000000..a1ac723 --- /dev/null +++ b/configs/ltxv-13b-0.9.8-distilled.yaml @@ -0,0 +1,29 @@ +pipeline_type: multi-scale +checkpoint_path: "ltxv-13b-0.9.8-distilled.safetensors" +downscale_factor: 0.6666666 +spatial_upscaler_model_path: "ltxv-spatial-upscaler-0.9.8.safetensors" +stg_mode: "attention_values" # options: "attention_values", "attention_skip", "residual", "transformer_block" +decode_timestep: 0.05 +decode_noise_scale: 0.025 +text_encoder_model_name_or_path: "PixArt-alpha/PixArt-XL-2-1024-MS" +precision: "bfloat16" +sampler: "from_checkpoint" # options: "uniform", "linear-quadratic", "from_checkpoint" +prompt_enhancement_words_threshold: 120 +prompt_enhancer_image_caption_model_name_or_path: "MiaoshouAI/Florence-2-large-PromptGen-v2.0" +prompt_enhancer_llm_model_name_or_path: "unsloth/Llama-3.2-3B-Instruct" +stochastic_sampling: false + +first_pass: + timesteps: [1.0000, 0.9937, 0.9875, 0.9812, 0.9750, 0.9094, 0.7250] + guidance_scale: 1 + stg_scale: 0 + rescaling_scale: 1 + skip_block_list: [42] + +second_pass: + timesteps: [0.9094, 0.7250, 0.4219] + guidance_scale: 1 + stg_scale: 0 + rescaling_scale: 1 + skip_block_list: [42] + tone_map_compression_ratio: 0.6 diff --git a/configs/ltxv-13b-0.9.7-distilled-fp8.yaml b/configs/ltxv-2b-0.9.8-distilled-fp8.yaml similarity index 88% rename from configs/ltxv-13b-0.9.7-distilled-fp8.yaml rename to configs/ltxv-2b-0.9.8-distilled-fp8.yaml index 151c6fc..c02b205 100644 --- a/configs/ltxv-13b-0.9.7-distilled-fp8.yaml +++ b/configs/ltxv-2b-0.9.8-distilled-fp8.yaml @@ -1,7 +1,7 @@ 
pipeline_type: multi-scale -checkpoint_path: "ltxv-13b-0.9.7-distilled-fp8.safetensors" +checkpoint_path: "ltxv-2b-0.9.8-distilled-fp8.safetensors" downscale_factor: 0.6666666 -spatial_upscaler_model_path: "ltxv-spatial-upscaler-0.9.7.safetensors" +spatial_upscaler_model_path: "ltxv-spatial-upscaler-0.9.8.safetensors" stg_mode: "attention_values" # options: "attention_values", "attention_skip", "residual", "transformer_block" decode_timestep: 0.05 decode_noise_scale: 0.025 diff --git a/configs/ltxv-13b-0.9.7-distilled.yaml b/configs/ltxv-2b-0.9.8-distilled.yaml similarity index 88% rename from configs/ltxv-13b-0.9.7-distilled.yaml rename to configs/ltxv-2b-0.9.8-distilled.yaml index 9df17bb..9e24b0e 100644 --- a/configs/ltxv-13b-0.9.7-distilled.yaml +++ b/configs/ltxv-2b-0.9.8-distilled.yaml @@ -1,7 +1,7 @@ pipeline_type: multi-scale -checkpoint_path: "ltxv-13b-0.9.7-distilled.safetensors" +checkpoint_path: "ltxv-2b-0.9.8-distilled.safetensors" downscale_factor: 0.6666666 -spatial_upscaler_model_path: "ltxv-spatial-upscaler-0.9.7.safetensors" +spatial_upscaler_model_path: "ltxv-spatial-upscaler-0.9.8.safetensors" stg_mode: "attention_values" # options: "attention_values", "attention_skip", "residual", "transformer_block" decode_timestep: 0.05 decode_noise_scale: 0.025 diff --git a/ltx_video/pipelines/pipeline_ltx_video.py b/ltx_video/pipelines/pipeline_ltx_video.py index 243f4a3..f467533 100644 --- a/ltx_video/pipelines/pipeline_ltx_video.py +++ b/ltx_video/pipelines/pipeline_ltx_video.py @@ -790,6 +790,7 @@ class LTXVideoPipeline(DiffusionPipeline): text_encoder_max_tokens: int = 256, stochastic_sampling: bool = False, media_items: Optional[torch.Tensor] = None, + tone_map_compression_ratio: float = 0.0, **kwargs, ) -> Union[ImagePipelineOutput, Tuple]: """ @@ -871,6 +872,8 @@ class LTXVideoPipeline(DiffusionPipeline): If set to `True`, the sampling is stochastic. If set to `False`, the sampling is deterministic. 
media_items ('torch.Tensor', *optional*): The input media item used for image-to-image / video-to-video. + tone_map_compression_ratio: compression ratio for tone mapping, defaults to 0.0. + If set to 0.0, no tone mapping is applied. If set to 1.0 - full compression is applied. Examples: Returns: @@ -1320,6 +1323,7 @@ class LTXVideoPipeline(DiffusionPipeline): ) else: decode_timestep = None + latents = self.tone_map_latents(latents, tone_map_compression_ratio) image = vae_decode( latents, self.vae, @@ -1741,6 +1745,47 @@ class LTXVideoPipeline(DiffusionPipeline): num_frames = (num_frames - 1) // scale_factor * scale_factor + 1 return num_frames + @staticmethod + def tone_map_latents( + latents: torch.Tensor, + compression: float, + ) -> torch.Tensor: + """ + Applies a non-linear tone-mapping function to latent values to reduce their dynamic range + in a perceptually smooth way using a sigmoid-based compression. + + This is useful for regularizing high-variance latents or for conditioning outputs + during generation, especially when controlling dynamic behavior with a `compression` factor. + + Parameters: + ---------- + latents : torch.Tensor + Input latent tensor with arbitrary shape. Expected to be roughly in [-1, 1] or [0, 1] range. + compression : float + Compression strength in the range [0, 1]. + - 0.0: No tone-mapping (identity transform) + - 1.0: Full compression effect + + Returns: + ------- + torch.Tensor + The tone-mapped latent tensor of the same shape as input. 
+        """
+        if not (0 <= compression <= 1):
+            raise ValueError("Compression must be in the range [0, 1]")
+
+        # Remap [0-1] to [0-0.75] and apply sigmoid compression in one shot
+        scale_factor = compression * 0.75
+        abs_latents = torch.abs(latents)
+
+        # Sigmoid compression: large values are scaled down toward ~0.4x at full compression; small values stay ~1.0
+        # When scale_factor=0 the correction term vanishes; scale_factor=0.75 gives the full effect
+        sigmoid_term = torch.sigmoid(4.0 * scale_factor * (abs_latents - 1.0))
+        scales = 1.0 - 0.8 * scale_factor * sigmoid_term
+
+        filtered = latents * scales
+        return filtered
+
 def adain_filter_latent(
     latents: torch.Tensor,
     reference_latents: torch.Tensor,
     factor=1.0