import gradio as gr
import random
import os
import spaces
import torch
import time
import json
import numpy as np
from diffusers import BriaFiboPipeline
from diffusers.modular_pipelines import ModularPipeline
from optimization import optimize_pipeline_

# Reference list of "width height" pairs kept from the original source
# (currently unused by the code below):
# resolutions=[
#     "832 1248",
#     "896 1152",
#     "960 1088",
#     "1024 1024",
#     "1088 960",
#     "1152 896",
#     "1216 832",
#     "1280 800",
#     "1344 768",
# ]

# Upper bound for the seed slider and for randomly drawn seeds.
MAX_SEED = np.iinfo(np.int32).max

dtype = torch.bfloat16
device = "cuda" if torch.cuda.is_available() else "cpu"

# This script only runs inference, so autograd is disabled globally.
torch.set_grad_enabled(False)

# Pipeline that turns a free-text prompt into the structured JSON prompt.
vlm_pipe = ModularPipeline.from_pretrained(
    "briaai/FIBO-VLM-prompt-to-JSON", trust_remote_code=True
).to(device)

# Image generation pipeline that consumes the JSON prompt.
pipe = BriaFiboPipeline.from_pretrained(
    "briaai/FIBO", trust_remote_code=True, torch_dtype=dtype
).to(device)

# Example JSON prompt handed to `optimize_pipeline_` below.
test_prompt_json = """
{
  "short_description": "A surreal and whimsical scene featuring a man, a woman, and a dog posed against a tri-colored backdrop. The woman stands in front of the red section, wearing a t-shirt with a Yoda motif and a skirt with birds. The dog, dressed as a superdog, sits on a checkerboard chair in front of the white section, with a blue tennis ball in its mouth. The man, in a purple suit, stands in front of the gold section, holding a tree branch with a blue jay. The backdrop is divided into red, white, and gold sections, with a small metal grating in the top left and a tear in the gold section. A rustic framed oil painting of the pyramids hangs above the dog.",
  "objects": [
    {
      "description": "A woman standing in front of the red backdrop. She is wearing a beige t-shirt with a Yoda motif and a long skirt with birds on it. Her right hand is holding an axe.",
      "location": "Center-left",
      "relationship": "She is positioned in front of the red backdrop and to the left of the dog and man.",
      "relative_size": "Medium",
      "shape_and_color": "Humanoid shape, beige and multicolored clothing.",
      "appearance_details": "She has a long skirt with birds on it and is holding an axe.",
      "pose": "Standing upright with a slight tilt to the right.",
      "expression": "Neutral",
      "clothing": "She is wearing a beige t-shirt with a Yoda motif and a long skirt with birds on it.",
      "action": "Standing",
      "gender": "Female",
      "skin_tone_and_texture": "Fair, smooth."
    },
    {
      "description": "A dog dressed as a superdog, sitting on a checkerboard chair in front of the white backdrop. It has a blue tennis ball in its mouth.",
      "location": "Center",
      "relationship": "It is positioned in front of the white backdrop and between the woman and the man.",
      "relative_size": "Medium",
      "shape_and_color": "Canine shape, brown and white fur, blue tennis ball.",
      "appearance_details": "It is dressed as a superdog and has a blue tennis ball in its mouth.",
      "pose": "Sitting upright.",
      "expression": "Neutral",
      "clothing": "Superdog costume.",
      "action": "Sitting",
      "gender": "Male",
      "skin_tone_and_texture": "Brown and white fur, soft."
    },
    {
      "description": "A man standing in front of the gold backdrop. He is wearing a three piece purple suit and has spiky blue hair. His left hand is holding a tree branch with a blue jay on it.",
      "location": "Center-right",
      "relationship": "He is positioned in front of the gold backdrop and to the right of the woman and dog.",
      "relative_size": "Medium",
      "shape_and_color": "Humanoid shape, purple suit, blue hair.",
      "appearance_details": "He has spiky blue hair and is holding a tree branch with a blue jay on it.",
      "pose": "Standing upright with a slight tilt to the left.",
      "expression": "Neutral",
      "clothing": "He is wearing a three piece purple suit.",
      "action": "Standing",
      "gender": "Male",
      "skin_tone_and_texture": "Fair, smooth."
    },
    {
      "description": "A checkerboard armchair in yellow and brown.",
      "location": "Bottom-center",
      "relationship": "The dog is sitting on the chair.",
      "relative_size": "Small",
      "shape_and_color": "Chair shape, yellow and brown.",
      "texture": "Smooth. End of texture answer.",
      "appearance_details": "The chair is a checkerboard armchair in yellow and brown."
    },
    {
      "description": "A rustic framed oil painting of the pyramids.",
      "location": "Top-center",
      "relationship": "The painting is hanging above the dog.",
      "relative_size": "Small",
      "shape_and_color": "Rectangular shape, brown frame, yellow and brown pyramids.",
      "texture": "Rough. End of texture answer.",
      "appearance_details": "The painting is a rustic framed oil painting of the pyramids."
    }
  ],
  "background_setting": "The background is a tri-colored backdrop divided equally into red, white, and gold sections. There is a small rectangular metal grating in the top left corner and a subtle tear in the gold backdrop in the bottom right corner.",
  "lighting": {
    "conditions": "Studio lighting",
    "direction": "Front-lit",
    "shadows": "Soft shadows are present, indicating diffused lighting."
  },
  "aesthetics": {
    "composition": "The composition is centered, with the three figures arranged in a row. The backdrop is divided into thirds, creating a symmetrical balance.",
    "color_scheme": "The color scheme is triadic, with red, white, and gold dominating the backdrop, complemented by the various colors of the figures' clothing and accessories.",
    "mood_atmosphere": "The mood is whimsical and surreal, with a touch of humor due to the unusual costumes and props.",
    "preference_score": "high",
    "aesthetic_score": "high"
  },
  "photographic_characteristics": {
    "depth_of_field": "Deep",
    "focus": "Sharp focus on all subjects",
    "camera_angle": "Eye-level",
    "lens_focal_length": "Standard"
  },
  "style_medium": "Photograph",
  "text_render": [
    {
      "text": "Yoda",
      "location": "Center of the woman's t-shirt",
      "size": "Small",
      "color": "Beige",
      "font": "Cartoonish",
      "appearance_details": "The text is part of a graphic design on the t-shirt."
    }
  ],
  "context": "This is a surreal and whimsical portrait of a man, a woman, and a dog posed against a tri-colored backdrop. It could be an art piece or a promotional image for a quirky event or product.",
  "artistic_style": "Surreal Pop"
}
"""

# NOTE(review): presumably compiles/warms up the pipeline using the example
# prompt; `optimize_pipeline_` lives in the local `optimization` module,
# which is not visible here -- confirm.
optimize_pipeline_(pipe, test_prompt_json)


def handle_json(text):
    """Return ``text`` unchanged if it parses as JSON, else ``"Error"``.

    Only referenced by the commented-out ``.change`` handlers in the UI
    section below.
    """
    try:
        json.loads(text)
    except (TypeError, ValueError):
        # json.JSONDecodeError is a ValueError subclass; TypeError covers
        # non-string input such as None.
        return "Error"
    return text


@spaces.GPU(duration=100)
def infer(
    prompt,
    negative_prompt="",
    seed=42,
    randomize_seed=False,
    width=1024,
    height=1024,
    guidance_scale=5,
    num_inference_steps=50,
):
    """Generate an image from a free-text prompt.

    The prompt is first converted to a structured JSON prompt by
    ``vlm_pipe``; that JSON prompt is then rendered by ``pipe``.

    Args:
        prompt: Free-text description of the desired image.
        negative_prompt: Negative prompt forwarded to the image pipeline.
        seed: Seed for the random generator (ignored when
            ``randomize_seed`` is true).
        randomize_seed: When true, draw a fresh seed in ``[0, MAX_SEED]``.
        width: Output width, forwarded to the image pipeline.
        height: Output height, forwarded to the image pipeline.
        guidance_scale: Guidance scale, forwarded to the image pipeline.
        num_inference_steps: Number of steps, forwarded to the pipeline.

    Returns:
        A ``(image, json_prompt)`` tuple: the first generated image and the
        JSON prompt that produced it.
    """
    if randomize_seed:
        seed = random.randint(0, MAX_SEED)

    # The seed was previously computed but never used, so the seed controls
    # had no effect. A CPU generator keeps seeding device-independent.
    # NOTE(review): relies on BriaFiboPipeline accepting the standard
    # diffusers `generator` argument -- confirm against the installed version.
    generator = torch.Generator(device="cpu").manual_seed(int(seed))

    with torch.inference_mode():
        # 1. Turn the free-text prompt into a structured JSON prompt.
        output = vlm_pipe(prompt=prompt)
        json_prompt = output.values["json_prompt"]

        # 2. Render the image from the JSON prompt.
        image = pipe(
            prompt=json_prompt,
            num_inference_steps=num_inference_steps,
            negative_prompt=negative_prompt,
            width=width,
            height=height,
            guidance_scale=guidance_scale,
            generator=generator,
        ).images[0]

    return image, json_prompt


css = """
#col-container{
    margin: 0 auto;
    max-width: 768px;
}
"""

with gr.Blocks(css=css) as demo:
    with gr.Column(elem_id="col-container"):
        gr.Markdown("## FOBI")

        with gr.Group():
            with gr.Column():
                with gr.Row():
                    prompt_in = gr.Textbox(label="Prompt")
                    prompt_in_json = gr.JSON(label="Json")
                submit_btn = gr.Button("Generate")

        result = gr.Image(label="output")

        with gr.Accordion("Advanced Settings", open=False):
            with gr.Row():
                seed = gr.Slider(
                    label="Seed",
                    minimum=0,
                    maximum=MAX_SEED,
                    step=1,
                    value=0,
                )
                randomize_seed = gr.Checkbox(label="Randomize seed", value=True)

            with gr.Row():
                guidance_scale = gr.Slider(
                    label="guidance scale",
                    minimum=1.0,
                    maximum=10.0,
                    step=0.1,
                    value=5.0,
                )
                num_inference_steps = gr.Slider(
                    label="number of inference steps",
                    minimum=1,
                    maximum=60,
                    step=1,
                    value=50,
                )
                height = gr.Slider(
                    label="Height",
                    minimum=768,
                    maximum=1248,
                    step=32,
                    value=1024,
                )
                width = gr.Slider(
                    label="Width",
                    minimum=832,
                    maximum=1344,
                    step=64,
                    value=1024,
                )

            with gr.Row():
                negative_prompt = gr.Textbox(
                    label="negative prompt", value=json.dumps('')
                )
                negative_prompt_json = gr.JSON(
                    label="json negative prompt", value=json.dumps('')
                )

    # Live JSON validation of the text boxes is currently disabled:
    # prompt_in.change(
    #     handle_json,
    #     inputs=prompt_in,
    #     outputs=prompt_in_json)
    # negative_prompt.change(handle_json, inputs=negative_prompt, outputs=negative_prompt_json)

    submit_btn.click(
        fn=infer,
        inputs=[
            prompt_in,
            negative_prompt,
            seed,
            randomize_seed,
            width,
            height,
            guidance_scale,
            num_inference_steps,
        ],
        outputs=[result, prompt_in_json],
    )

demo.queue().launch()