Missing image_encoder and image_processor
Hi,
Thank you for the wonderful work!
On my first try, I ran into the following issue:
"""
ValueError: Pipeline <class 'diffusers.pipelines.wan.pipeline_wan_i2v.WanImageToVideoPipeline'> expected ['image_encoder', 'image_processor', 'scheduler', 'text_encoder', 'tokenizer', 'transformer', 'vae'], but only {'text_encoder', 'scheduler', 'tokenizer', 'vae', 'transformer'} were passed.
"""
I suspect this is because the image processor and image encoder files are missing from the Hugging Face checkpoint. Is there a way to fix this so I can try the model?
Thank you for your help!
We will merge PR https://github.com/huggingface/diffusers/pull/12004 soon.
For now, you need to install from the PR:
pip install git+https://github.com/huggingface/[email protected]
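To check that the fix is actually picked up, here is a minimal sketch; the checkpoint id is an assumption on my part, so substitute the Wan 2.2 I2V repo you are loading:

import torch
import diffusers
from diffusers import WanImageToVideoPipeline

print(diffusers.__version__)  # should be the dev version from the PR branch, not a release

# Checkpoint id is an assumption; replace it with the repo you are using.
pipe = WanImageToVideoPipeline.from_pretrained(
    "Wan-AI/Wan2.2-I2V-A14B-Diffusers", torch_dtype=torch.bfloat16
)
print(list(pipe.components))  # from_pretrained should no longer raise the ValueError above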
Hello, I am still seeing the same issue when deploying this to SageMaker with the container 763104351884.dkr.ecr.us-east-1.amazonaws.com/huggingface-pytorch-inference:2.6.0-transformers4.51.3-gpu-py312-cu124-ubuntu22.04. My requirements.txt and inference.py are below. Is there anything I am missing?
2025-12-18T20:56:14,518 [INFO ] W-9000-model-stdout com.amazonaws.ml.mms.wlm.WorkerLifeCycle - ValueError: Pipeline <class 'diffusers.pipelines.wan.pipeline_wan_i2v.WanImageToVideoPipeline'> expected ['image_encoder', 'image_processor', 'scheduler', 'text_encoder', 'tokenizer', 'transformer', 'vae'], but only {'vae', 'scheduler', 'text_encoder', 'tokenizer', 'transformer'} were passed.
requirements.txt:
torchvision==0.21.0
opencv-python==4.11.0.86
git+https://github.com/huggingface/[email protected]
transformers>=4.57.0
tokenizers==0.21.1
accelerate==1.4.0
peft==0.17.1
ftfy==6.3.1
ffmpeg==1.4
imageio==2.37.2
imageio-ffmpeg==0.6.0
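One diagnostic worth adding, since the container may already ship a diffusers release that shadows the git install: log which diffusers the worker actually imports at startup. A minimal sketch, purely diagnostic:

import diffusers

# If this prints a released version instead of the dev build from the PR
# branch, the git requirement was not installed, or was shadowed by a
# package preinstalled in the container.
print("diffusers", diffusers.__version__, "from", diffusers.__file__)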
inference.py:

import os
import time

import boto3
import torch
from botocore.exceptions import ClientError
from diffusers import AutoencoderKLWan, WanImageToVideoPipeline
from diffusers.utils import export_to_video, load_image
def upload_file(file_name, bucket, object_name=None):
    # If no S3 object name was specified, fall back to the file's base name
    if object_name is None:
        object_name = os.path.basename(file_name)
    # Upload the file and report success/failure
    s3 = boto3.client("s3")
    try:
        s3.upload_file(file_name, bucket, object_name)
    except ClientError as e:
        print(e)
        return False
    return True
def model_fn(model_dir):
    print("Loading pipeline...")
    # Load the VAE in float32 for numerical stability, and pass it to the
    # pipeline explicitly; otherwise the loaded VAE is silently unused.
    vae = AutoencoderKLWan.from_pretrained(model_dir, subfolder="vae", torch_dtype=torch.float32)
    pipe = WanImageToVideoPipeline.from_pretrained(
        model_dir, vae=vae, torch_dtype=torch.bfloat16, device_map="balanced"
    )
    return pipe
def predict_fn(data, pipe):
    print("inference started")
    bucket = data.pop("bucket")
    file_name = data.pop("file_name", "model_output.mp4")
    prompt = data.pop("prompt", "Summer beach vacation style, a white cat wearing sunglasses sits on a surfboard. The fluffy-furred feline gazes directly at the camera with a relaxed expression. Blurred beach scenery forms the background featuring crystal-clear waters, distant green hills, and a blue sky dotted with white clouds. The cat assumes a naturally relaxed posture, as if savoring the sea breeze and warm sunlight. A close-up shot highlights the feline's intricate details and the refreshing atmosphere of the seaside.")
    negative_prompt = data.pop("negative_prompt", "Bright tones, overexposed, static, blurred details, subtitles, style, works, paintings, images, static, overall gray, worst quality, low quality, JPEG compression residue, ugly, incomplete, extra fingers, poorly drawn hands, poorly drawn faces, deformed, disfigured, misshapen limbs, fused fingers, still picture, messy background, three legs, many people in the background, walking backwards")
    height = int(data.pop("height", 480))
    width = int(data.pop("width", 832))
    num_frames = int(data.pop("num_frames", 17))
    guidance_scale = float(data.pop("guidance_scale", 5.0))
    fps = int(data.pop("fps", 15))

    # Fixed test image for now; parameterize via `data` for real requests
    image = load_image("https://huggingface.co/datasets/YiYiXu/testing-images/resolve/main/wan_i2v_input.JPG")

    start_time = time.perf_counter()
    generator = torch.Generator(device="cuda").manual_seed(0)
    output = pipe(
        image=image,
        prompt=prompt,
        negative_prompt=negative_prompt,
        height=height,
        width=width,
        num_frames=num_frames,
        guidance_scale=guidance_scale,
        num_inference_steps=40,
        generator=generator,
    ).frames[0]
    end_time = time.perf_counter()
    elapsed_time = end_time - start_time
    print(f"Execution time - in PREDICT : {elapsed_time:.6f} seconds")

    # Write the video to /tmp, upload it to S3, then clean up the local file
    file_path = f"/tmp/{os.path.basename(file_name)}"
    export_to_video(output, file_path, fps=fps)
    upload_file(file_path, bucket, file_name)
    try:
        os.remove(file_path)
    except FileNotFoundError:
        print(f"Error: File '{file_path}' not found.")
    except Exception as e:
        print(f"An error occurred: {e}")
    return {"generated_video": f"s3://{bucket}/{file_name}"}
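For completeness, a sketch of how a client could invoke the deployed endpoint; the endpoint name, bucket, and file name below are placeholders, not values from this thread:

import json
import boto3

runtime = boto3.client("sagemaker-runtime")

# EndpointName, bucket, and file_name are placeholders; predict_fn above pops
# these keys from the decoded JSON payload.
response = runtime.invoke_endpoint(
    EndpointName="wan-i2v-endpoint",
    ContentType="application/json",
    Body=json.dumps({
        "bucket": "my-output-bucket",
        "file_name": "cat_surfboard.mp4",
        "num_frames": 17,
    }),
)
print(json.loads(response["Body"].read()))  # {"generated_video": "s3://..."}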