import streamlit as st

# Set the page layout to 'wide'
st.set_page_config(layout="wide")

import requests
from PIL import Image
from io import BytesIO
# from IPython.display import display
import base64
import time
import random

# helper decoder
def decode_base64_image(image_string):
    base64_image = base64.b64decode(image_string)
    buffer = BytesIO(base64_image)
    return Image.open(buffer)

# resize a PIL image for display
def display_image(image=None, width=500, height=500):
    img = image.resize((width, height))
    return img

def pretty_print(messages):
    # join the chat history into a readable "role: content" transcript
    return "\n".join(f"{message['role']}: {message['content']}" for message in messages)

# API Gateway endpoint URL
api_url = 'https://a02q342s5b.execute-api.us-east-2.amazonaws.com/reinvent-demo-inf2-sm-20231114'
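# Request/response contract used throughout this app: the gateway is sent a JSON body
# of the form {"prompt": "...", "parameters": {...}, "endpoint": "<SageMaker endpoint name>"}
# and, on success, returns {"generated_images": ["<base64-encoded JPEG>", ...]}.
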
# # Define the CSS to change the text input background color
# input_field_style = """
# <style>
# /* Customize the text input field background and text color */
# .stTextInput input {
#     background-color: #fbd8bf; /* 'Rind' color */
#     color: #232F3E; /* Dark text color */
# }
# /* You might also want to change the color for textarea if you're using it */
# .stTextArea textarea {
#     background-color: #fbd8bf; /* 'Rind' color */
#     color: #232F3E; /* Dark text color */
# }
# </style>
# """
# # Inject custom styles into the Streamlit app
# st.markdown(input_field_style, unsafe_allow_html=True)

# Creating Tabs
tab1, tab2, tab3, tab4 = st.tabs(["Image Generation", "Architecture", "Stable Diffusion Architecture", "Code"])

with tab1:
    # Create two columns for layout
    left_column, right_column = st.columns(2)

    with right_column:
        cont = st.container()

    # ===========
    with left_column:
        # Define Streamlit UI elements
        st.title('Stable Diffusion XL Image Generation with AWS Inferentia2')

        sample_prompts = [
            "A futuristic cityscape at sunset, cyberpunk",
            "A serene landscape with mountains and a river, photorealistic style",
            "An astronaut riding a horse, artistic and surreal",
            "A robot playing chess in a medieval setting, high detail",
            "An underwater scene with colorful coral reefs and fish, vibrant colors",
            "Raccoon astronaut in space, sci-fi, future, cold color palette, muted colors, detailed, 8k",
            "A lost city rediscovered in the Amazon jungle, overgrown with plants, in the style of a vintage travel poster",
            "A steampunk train emitting clouds of steam as it races through a mountain pass, digital art",
            "An enchanted forest with bioluminescent trees and fairies dancing, in a Studio Ghibli style",
            "A portrait of an elegant alien empress with a detailed headdress, reminiscent of Art Nouveau",
            "A post-apocalyptic Tokyo with nature reclaiming skyscrapers, in the style of a concept art",
            "A mythical phoenix rising from ashes, vibrant colors, with a nebula in the background",
            "A cybernetic wolf in a neon-lit city, cyberpunk theme, rain-drenched streets",
            "A high fantasy battle scene with dragons in the sky and knights on the ground, epic scale",
            "An ice castle on a lonely mountain peak, under the northern lights, fantasy illustration",
            "A surreal landscape where giant flowers bloom in the desert, with a distant thunderstorm, hyperrealism"
        ]

        def set_random_prompt():
            # This function is called when the 'Random Prompt' button is clicked
            random_prompt = random.choice(sample_prompts)
            # Update the session state for the input field
            st.session_state.prompt_one = random_prompt

        prompt_one = st.text_area("Enter your prompt:",
                                  key="prompt_one")
        st.button('Random Prompt', on_click=set_random_prompt)

        # Number of inference steps
        num_inference_steps_one = st.slider("Number of Inference Steps",
                                            min_value=1,
                                            max_value=100,
                                            value=15,
                                            help="More steps might improve quality, with diminishing marginal returns. 30-50 seems best, but your mileage may vary.")

        # Create an expandable section for optional parameters
        with st.expander("Optional Parameters"):
            # Random seed input
            seed_one = st.number_input("Random seed",
                                       value=555,
                                       help="Set to the same value to generate the same image if other inputs are the same; change it to generate a different image for the same inputs.")
            # Negative prompt input
            negative_prompt_one = st.text_area("Enter your negative prompt:",
                                               "cartoon, graphic, text, painting, crayon, graphite, abstract glitch, blurry")
        if st.button('Generate Image'):
            with st.spinner(f'Generating Image with {num_inference_steps_one} iterations'):
                start_time = time.time()
                # ===============
                # Example input data
                prompt_input_one = {
                    "prompt": prompt_one,
                    "parameters": {
                        "num_inference_steps": num_inference_steps_one,
                        "seed": seed_one,
                        "negative_prompt": negative_prompt_one
                    },
                    "endpoint": "huggingface-pytorch-inference-neuronx-2023-11-14-21-22-10-388"
                }

                # Make API request
                response_one = requests.post(api_url, json=prompt_input_one)

                # Process and display the response
                if response_one.status_code == 200:
                    result_one = response_one.json()
                    # st.success(f"Prediction result: {result}")
                    image_one = display_image(decode_base64_image(result_one["generated_images"][0]))
                    cont.image(image_one,
                               caption=f"{prompt_one}")
                    end_time = time.time()
                    total_time = round(end_time - start_time, 2)
                    cont.text(f"Prompt: {prompt_one}")
                    cont.text(f"Number of Iterations: {num_inference_steps_one}")
                    cont.text(f"Random Seed: {seed_one}")
                    cont.text(f'Total time taken: {total_time} seconds')
                    # Calculate and display the average time per iteration in seconds
                    time_per_iteration = total_time / num_inference_steps_one
                    cont.text(f'Time per iteration: {time_per_iteration:.2f} seconds')
                else:
                    st.error(f"Error: {response_one.text}")
    # with pass:
    #     st.title('Llama 2 7B Text Generation with AWS Inferentia 2')
    #     params = {
    #         "do_sample": True,
    #         "top_p": 0.6,
    #         "temperature": 0.9,
    #         "top_k": 50,
    #         "max_new_tokens": 512,
    #         "repetition_penalty": 1.03,
    #     }
    #     if "messages" not in st.session_state:
    #         st.session_state.messages = [
    #             {"role": "system", "content": "You are a helpful Travel Planning Assistant. You respond with only 1-2 sentences."},
    #             {'role': 'user', 'content': 'Where can I travel in the fall for cloudy, rainy, and beautiful views?'},
    #         ]
    #     for message in st.session_state.messages:
    #         with st.chat_message(message["role"]):
    #             st.markdown(message["content"])
    #     with st.chat_message("assistant"):
    #         message_placeholder = st.empty()
    #         full_response = ""
    #         prompt_input_one = {
    #             "prompt": st.session_state.messages,
    #             "parameters": params,
    #             "endpoint": "huggingface-pytorch-inference-neuronx-2023-11-28-16-09-51-708"
    #         }
    #         response_one = requests.post(api_url, json=prompt_input_one)
    #         if response_one.status_code == 200:
    #             result_one = response_one.json()
    #             # st.success(f"Prediction result: {result}")
    #             full_response += result_one["generation"]
    #         else:
    #             st.error(f"Error: {response_one.text}")
    #         message_placeholder.markdown(full_response)
    #         st.session_state.messages.append({"role": "assistant", "content": full_response})
    #     if prompt := st.chat_input("What is up?"):
    #         st.session_state.messages.append({"role": "user", "content": prompt})
    #         print(st.session_state.messages)
    #         with st.chat_message("user"):
    #             st.markdown(prompt)
    #         with st.chat_message("assistant"):
    #             message_placeholder = st.empty()
    #             new_response = ""
    #             prompt_input_one = {
    #                 "prompt": st.session_state.messages,
    #                 "parameters": params,
    #                 "endpoint": "huggingface-pytorch-inference-neuronx-2023-11-28-16-09-51-708"
    #             }
    #             response_one = requests.post(api_url, json=prompt_input_one)
    #             if response_one.status_code == 200:
    #                 result_one = response_one.json()
    #                 # st.success(f"Prediction result: {result}")
    #                 new_response += result_one["generation"]
    #             else:
    #                 st.error(f"Error: {response_one.text}")
    #             message_placeholder.markdown(new_response)
    #             st.session_state.messages.append({"role": "assistant", "content": new_response})
    pass

with tab2:
    # ===========
    left_column, _, right_column = st.columns([2, .2, 3])

    with right_column:
        # Define Streamlit UI elements
        st.markdown("""<br>""", unsafe_allow_html=True)
        st.markdown("""<br>""", unsafe_allow_html=True)
        st.markdown("""<br>""", unsafe_allow_html=True)
        st.markdown("""<br>""", unsafe_allow_html=True)
        st.markdown("""<br>""", unsafe_allow_html=True)
        st.image('./architecture.png', caption="Application Architecture")

    with left_column:
        st.write("## Architecture Overview")
        st.write("This diagram illustrates the architecture of our Generative AI service, which is composed of several interconnected AWS services, notably Amazon Elastic Compute Cloud (Amazon EC2). Here's a detailed look at each component:")

        with st.expander("(1) Inference Models"):
            st.markdown("""
- The architecture starts with our trained machine learning models hosted on Amazon SageMaker, running on an AWS Inferentia2 instance (`inf2.xlarge`).
- Two models are shown here: Stable Diffusion XL for image generation and Llama 2 7B for text generation.
            """)

        with st.expander("(2) Amazon SageMaker Endpoints"):
            st.markdown("""
- The models are exposed via SageMaker Endpoints, which provide scalable and secure real-time inference services.
- These endpoints are the interfaces through which the models receive input data and return predictions.
            """)

        with st.expander("(3) AWS Lambda"):
            st.markdown("""
- AWS Lambda functions serve as the middle layer, handling the logic of communicating with the SageMaker Endpoints.
- Lambda can process the incoming requests, perform any necessary transformations, call the endpoints, and then process the results before sending them back.
            """)

        with st.expander("(4) Amazon API Gateway"):
            st.markdown("""
- The processed results from Lambda are then routed through Amazon API Gateway.
- API Gateway acts as a front door that manages all incoming API requests, including authorization, throttling, and CORS handling.
            """)

        with st.expander("(5) Streamlit Frontend"):
            st.markdown("""
- Finally, our Streamlit application provides a user-friendly interface for end users to interact with the service.
- It sends requests to the API Gateway and displays the predictions returned by the machine learning models.
            """)

        st.write("""
In summary, this architecture enables a scalable, serverless, and responsive Generative AI service that can serve real-time predictions to users directly from a web interface.
        """)

with tab3:
    left_column, _, right_column = st.columns([2, .2, 3])

    with right_column:
        # Define Streamlit UI elements
        st.markdown("""<br>""", unsafe_allow_html=True)
        st.image('./sdxl_arch.png', caption="SDXL Architecture")
    with left_column:
        st.write("## SDXL Architecture Overview")
        st.write("""
The Stable Diffusion model takes both a latent seed and a text prompt as input. The latent seed is used to generate random latent image representations of size 64×64, whereas the text prompt is transformed into text embeddings of size 77×768 via CLIP's text encoder.

Next, the U-Net iteratively denoises the random latent image representations while being conditioned on the text embeddings. The output of the U-Net, the noise residual, is used to compute a denoised latent image representation via a scheduler algorithm. Many different scheduler algorithms can be used for this computation, each with its pros and cons.

The theory of how scheduler algorithms work is out of scope for this demo; in short, they compute the predicted denoised image representation from the previous noise representation and the predicted noise residual.

The denoising process is repeated roughly 50 times to retrieve better and better latent image representations step by step. Once complete, the latent image representation is decoded by the decoder part of the variational autoencoder. A schematic version of this loop is sketched in the expander below.
        """)
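        # Illustrative addition (not part of the original demo): a minimal, schematic sketch of
        # the denoising loop described above, shown in an expander. The scheduler settings and
        # the toy `predict_noise` stand-in are assumptions for illustration only; the real
        # pipeline uses the compiled SDXL U-Net, CLIP text encoders, and VAE decoder.
        with st.expander("Denoising loop, schematically (illustrative sketch)"):
            st.markdown("""
```python
import torch
from diffusers import EulerDiscreteScheduler

# scheduler settings are illustrative, not the exact SDXL defaults
scheduler = EulerDiscreteScheduler(beta_start=0.00085, beta_end=0.012,
                                   beta_schedule="scaled_linear")
scheduler.set_timesteps(50)  # roughly 50 denoising steps, as described above

text_embeddings = torch.randn(1, 77, 768)   # stands in for the CLIP text-encoder output
latents = torch.randn(1, 4, 64, 64) * scheduler.init_noise_sigma  # from the latent seed

def predict_noise(latent_input, t, encoder_hidden_states):
    # placeholder for the real U-Net noise predictor
    return torch.zeros_like(latent_input)

for t in scheduler.timesteps:
    latent_input = scheduler.scale_model_input(latents, t)
    noise_pred = predict_noise(latent_input, t, encoder_hidden_states=text_embeddings)
    # the scheduler turns the predicted noise residual into the next, less noisy latents
    latents = scheduler.step(noise_pred, t, latents).prev_sample

# in the real pipeline, the VAE decoder now turns `latents` into the final image
```
            """)
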
with tab4:
    with st.expander("(1) Deploy GenAI Model to AWS Inferentia 2 Instance and Amazon SageMaker Endpoint"):
        st.markdown(
            """
[Source] This code is modified from this fantastic blog by Phil Schmid at Hugging Face: https://www.philschmid.de/inferentia2-stable-diffusion-xl

# Deploy Stable Diffusion XL on AWS Inferentia2 with Amazon SageMaker

In this end-to-end tutorial, you will learn how to deploy and speed up Stable Diffusion XL inference using AWS Inferentia2 and [optimum-neuron](https://huggingface.co/docs/optimum-neuron/index) on Amazon SageMaker. [Optimum Neuron](https://huggingface.co/docs/optimum-neuron/index) is the interface between the Hugging Face Transformers & Diffusers libraries and AWS Accelerators, including AWS Trainium and AWS Inferentia2.

You will learn how to:

1. Convert Stable Diffusion XL to AWS Neuron (Inferentia2) with `optimum-neuron`
2. Create a custom `inference.py` script for Stable Diffusion
3. Upload the neuron model and inference script to Amazon S3
4. Deploy a Real-time Inference Endpoint on Amazon SageMaker
5. Generate images using the deployed model

## Quick intro: AWS Inferentia 2
[AWS Inferentia2 (Inf2)](https://aws.amazon.com/de/ec2/instance-types/inf2/) instances are purpose-built EC2 instances for deep learning (DL) inference workloads. Inferentia2 is the successor of [AWS Inferentia](https://aws.amazon.com/ec2/instance-types/inf1/?nc1=h_ls) and promises to deliver up to 4x higher throughput and up to 10x lower latency.

| instance size | accelerators | Neuron Cores | accelerator memory (GB) | vCPU | CPU Memory (GiB) | on-demand price ($/h) |
| ------------- | ------------ | ------------ | ----------------------- | ---- | ---------------- | --------------------- |
| inf2.xlarge   | 1            | 2            | 32                      | 4    | 16               | 0.76                  |
| inf2.8xlarge  | 1            | 2            | 32                      | 32   | 128              | 1.97                  |
| inf2.24xlarge | 6            | 12           | 192                     | 96   | 384              | 6.49                  |
| inf2.48xlarge | 12           | 24           | 384                     | 192  | 768              | 12.98                 |

Additionally, Inferentia2 will support the writing of custom operators in C++ and new data types, including `FP8` (cFP8).

Let's get started! 🚀

*If you are going to use SageMaker in a local environment (not SageMaker Studio or Notebook Instances), you need access to an IAM Role with the required permissions for SageMaker. You can find out more about it [here](https://docs.aws.amazon.com/sagemaker/latest/dg/sagemaker-roles.html).*
## 1. Convert Stable Diffusion to AWS Neuron (Inferentia2) with `optimum-neuron`

We are going to use [optimum-neuron](https://huggingface.co/docs/optimum-neuron/index) to compile/convert our model to neuronx. Optimum Neuron provides a set of tools enabling easy model loading, training and inference on single- and multi-accelerator settings for different downstream tasks.

As a first step, we need to install `optimum-neuron` and the other required packages.

*Tip: If you are using Amazon SageMaker Notebook Instances or Studio you can go with the `conda_python3` conda kernel.*

```python
# Install the required packages
%pip install "optimum-neuron==0.0.13" "diffusers==0.21.4" --upgrade
%pip install "sagemaker>=2.197.0" --upgrade
```
After we have installed `optimum-neuron`, we can load and convert our model.

We are going to use the [stabilityai/stable-diffusion-xl-base-1.0](https://huggingface.co/stabilityai/stable-diffusion-xl-base-1.0) model. Stable Diffusion XL (SDXL) from [Stability AI](https://stability.ai/) is the newest text-to-image generation model, which can create photorealistic images with more detailed imagery and composition than previous SD models, including SD 2.1.

At the time of writing, [AWS Inferentia2 does not support dynamic shapes for inference](https://awsdocs-neuron.readthedocs-hosted.com/en/latest/general/arch/neuron-features/dynamic-shapes.html?highlight=dynamic%20shapes#), which means that we need to specify our image size in advance for compilation and inference.

In simpler terms, this means we need to define the input shapes for our prompt (sequence length), batch size, and the height and width of the image.

We precompiled the model with the following parameters and pushed it to the Hugging Face Hub:

* `height`: 1024
* `width`: 1024
* `sequence_length`: 128
* `num_images_per_prompt`: 1
* `batch_size`: 1
* `neuron`: 2.15.0

_Note: If you want to compile your own model or a different Stable Diffusion XL checkpoint you need ~120GB of memory and the compilation can take ~45 minutes. We used an `inf2.8xlarge` EC2 instance with the [Hugging Face Neuron Deep Learning AMI](https://aws.amazon.com/marketplace/pp/prodview-gr3e6yiscria2) to compile the model._
```python
from huggingface_hub import snapshot_download

# compiled model id
compiled_model_id = "aws-neuron/stable-diffusion-xl-base-1-0-1024x1024"

# save compiled model to local directory
save_directory = "sdxl_neuron"

# Download our compiled model from the Hugging Face Hub,
# using the revision as the Neuron version reference,
# and make sure we exclude the symlink files and "hidden" files, like .DS_Store, .gitignore, etc.
snapshot_download(compiled_model_id, revision="2.15.0", local_dir=save_directory, local_dir_use_symlinks=False, allow_patterns=["[!.]*.*"])
###############################################
# COMMENT IN BELOW TO COMPILE DIFFERENT MODEL #
###############################################
#
# from optimum.neuron import NeuronStableDiffusionXLPipeline
#
# # model id you want to compile
# vanilla_model_id = "stabilityai/stable-diffusion-xl-base-1.0"
#
# # configs for compiling the model
# compiler_args = {"auto_cast": "all", "auto_cast_type": "bf16"}
# input_shapes = {
#     "height": 1024,               # height of the generated image
#     "width": 1024,                # width of the generated image
#     "num_images_per_prompt": 1,   # number of images to generate per prompt
#     "batch_size": 1               # batch size for the model
# }
#
# sd = NeuronStableDiffusionXLPipeline.from_pretrained(vanilla_model_id, export=True, **input_shapes, **compiler_args)
#
# # Save locally or upload to the HuggingFace Hub
# save_directory = "sdxl_neuron"
# sd.save_pretrained(save_directory)
```
## 2. Create a custom `inference.py` script for Stable Diffusion

The [Hugging Face Inference Toolkit](https://github.com/aws/sagemaker-huggingface-inference-toolkit) supports zero-code deployments on top of the [pipeline feature](https://huggingface.co/transformers/main_classes/pipelines.html) from 🤗 Transformers. This allows users to deploy Hugging Face transformers without an inference script [[Example](https://github.com/huggingface/notebooks/blob/master/sagemaker/11_deploy_model_from_hf_hub/deploy_transformer_model_from_hf_hub.ipynb)].

Currently, this feature is not supported with AWS Inferentia2, which means we need to provide an `inference.py` script for running inference. But `optimum-neuron` has integrated support for the 🤗 Diffusers pipeline feature, so we can use `optimum-neuron` to create a pipeline for our model.

If you want to know more about the `inference.py` script check out this [example](https://github.com/huggingface/notebooks/blob/master/sagemaker/17_custom_inference_script/sagemaker-notebook.ipynb). It explains, amongst other things, what `model_fn` and `predict_fn` are.

```python
# create a code directory in our model directory
!mkdir {save_directory}/code
```

We use `NEURON_RT_NUM_CORES=2` to make sure that each HTTP worker uses 2 Neuron Cores to maximize throughput.
```python
%%writefile {save_directory}/code/inference.py
import os

# use two Neuron Cores per worker
os.environ["NEURON_RT_NUM_CORES"] = "2"

import torch
import torch_neuronx
import base64
from io import BytesIO
from optimum.neuron import NeuronStableDiffusionXLPipeline


def model_fn(model_dir):
    # load the local, converted model into a pipeline
    pipeline = NeuronStableDiffusionXLPipeline.from_pretrained(model_dir, device_ids=[0, 1])
    return pipeline


def predict_fn(data, pipeline):
    # extract prompt and parameters from the request data
    prompt = data.pop("inputs", data)
    parameters = data.pop("parameters", None)

    if parameters is not None:
        generated_images = pipeline(prompt, **parameters)["images"]
    else:
        generated_images = pipeline(prompt)["images"]

    # postprocess: convert each image into a base64-encoded string
    encoded_images = []
    for image in generated_images:
        buffered = BytesIO()
        image.save(buffered, format="JPEG")
        encoded_images.append(base64.b64encode(buffered.getvalue()).decode())

    # return all generated images as base64 strings
    return {"generated_images": encoded_images}
```
## 3. Upload the neuron model and inference script to Amazon S3

Before we can deploy our neuron model to Amazon SageMaker, we need to upload all our model artifacts to Amazon S3.

_Note: Currently `inf2` instances are only available in the `us-east-2` & `us-east-1` regions [[REF](https://aws.amazon.com/de/about-aws/whats-new/2023/05/sagemaker-ml-inf2-ml-trn1-instances-model-deployment/)]. Therefore we need to make sure our SageMaker session is in one of these regions._

Let's create our SageMaker session and upload our model to Amazon S3.
```python
import sagemaker
import boto3

sess = sagemaker.Session()
# sagemaker session bucket -> used for uploading data, models and logs
# sagemaker will automatically create this bucket if it does not exist
sagemaker_session_bucket = None
if sagemaker_session_bucket is None and sess is not None:
    # set to default bucket if a bucket name is not given
    sagemaker_session_bucket = sess.default_bucket()

try:
    role = sagemaker.get_execution_role()
except ValueError:
    iam = boto3.client('iam')
    role = iam.get_role(RoleName='sagemaker_execution_role')['Role']['Arn']

sess = sagemaker.Session(default_bucket=sagemaker_session_bucket)

print(f"sagemaker role arn: {role}")
print(f"sagemaker bucket: {sess.default_bucket()}")
print(f"sagemaker session region: {sess.boto_region_name}")
assert sess.boto_region_name in ["us-east-2", "us-east-1"], "region must be us-east-2 or us-east-1, due to instance availability"
```
We create our `model.tar.gz` with our `inference.py` script:
```python
# create a model.tar.gz archive with all the model artifacts and the inference.py script
%cd {save_directory}
!tar zcvf model.tar.gz *
%cd ..
```
Next, we upload our `model.tar.gz` to Amazon S3 using our session bucket and the `sagemaker` SDK.

```python
from sagemaker.s3 import S3Uploader

# create s3 uri
s3_model_path = f"s3://{sess.default_bucket()}/neuronx/sdxl"

# upload model.tar.gz
s3_model_uri = S3Uploader.upload(local_path=f"{save_directory}/model.tar.gz", desired_s3_uri=s3_model_path)
print(f"model artifacts uploaded to {s3_model_uri}")
```
## 4. Deploy a Real-time Inference Endpoint on Amazon SageMaker

After we have uploaded our model artifacts to Amazon S3, we can create a custom `HuggingFaceModel`. This class will be used to create and deploy our real-time inference endpoint on Amazon SageMaker.

The `inf2.xlarge` instance type is the smallest instance type with AWS Inferentia2 support. It comes with 1 Inferentia2 chip with 2 Neuron Cores. This means we can use both Neuron Cores to minimize latency for our image generation.

```python
from sagemaker.huggingface.model import HuggingFaceModel

# create Hugging Face Model Class
huggingface_model = HuggingFaceModel(
    model_data=s3_model_uri,        # path to your model.tar.gz on s3
    role=role,                      # iam role with permissions to create an Endpoint
    transformers_version="4.34.1",  # transformers version used
    pytorch_version="1.13.1",       # pytorch version used
    py_version='py310',             # python version used
    model_server_workers=1,         # number of workers for the model server
)

# deploy the endpoint
predictor = huggingface_model.deploy(
    initial_instance_count=1,        # number of instances
    instance_type="ml.inf2.xlarge",  # AWS Inferentia2 instance
    volume_size=100
)
# ignore the "Your model is not compiled. Please compile your model before using Inferentia." warning, we already compiled our model.
```

## 5. Generate images using the deployed model

The `.deploy()` call returns a `HuggingFacePredictor` object which can be used to request inference. Our endpoint expects a `json` payload with at least an `inputs` key. The `inputs` key is the input prompt for the model, which will be used to generate the image. Additionally, we can provide inference parameters, e.g. `num_inference_steps`.

The `predictor.predict()` function returns a `json` with the `generated_images` key. The `generated_images` key contains the generated image as a `base64`-encoded string. To decode our response we added a small helper function `decode_base64_image` which takes the `base64`-encoded string and returns a `PIL.Image` object, and `display_image` displays it.
```python
from PIL import Image
from io import BytesIO
from IPython.display import display
import base64


# helper decoder
def decode_base64_image(image_string):
    base64_image = base64.b64decode(image_string)
    buffer = BytesIO(base64_image)
    return Image.open(buffer)


# resize and display a PIL image
def display_image(image=None, width=500, height=500):
    img = image.resize((width, height))
    display(img)
```
Now, let's generate some images. As an example: `A dog trying to catch a flying pizza in the style of a comic book, at a street corner.` Generating an image with 25 steps takes around ~6 seconds, except for the first request, which can take 45-60s.

_Note: If the request times out, just rerun it. Only the first request takes a long time._
```python
prompt = "A dog trying catch a flying pizza at a street corner, comic book, well lit, night time"

# run prediction
response = predictor.predict(data={
        "inputs": prompt,
        "parameters": {
            "num_inference_steps": 25,
            "negative_prompt": "disfigured, ugly, deformed"
        }
    }
)

# decode and display image
display_image(decode_base64_image(response["generated_images"][0]))
```
### Delete model and endpoint

To clean up, we can delete the model and endpoint.

```python
predictor.delete_model()
predictor.delete_endpoint()
```
            """
        )
    with st.expander("(2) AWS Lambda Function to handle inference requests"):
        st.markdown(
            """
```python
import boto3
import json


def lambda_handler(event, context):
    # SageMaker endpoint details
    endpoint_name = 'INSERT_YOUR_SAGEMAKER_ENDPOINT_NAME_HERE'
    runtime = boto3.client('sagemaker-runtime')

    # Sample input data (modify as per your model's input requirements)
    # Get the prompt from the Lambda function input
    print("======== event payload: ==========")
    print(event['body'])

    print("======== prompt payload: ==========")
    event_parsed = json.loads(event['body'])
    prompt = event_parsed.get('prompt', '')
    print(prompt)

    print("======== params payload: ==========")
    params = event_parsed.get('parameters', '')
    print(params)

    # Prepare input data
    model_input = {
        'inputs': prompt,
        'parameters': params
    }
    input_data = json.dumps(model_input)

    # Make a prediction request to the SageMaker endpoint
    response = runtime.invoke_endpoint(EndpointName=endpoint_name,
                                       ContentType='application/json',
                                       Body=input_data)

    # Parse the response
    result = response['Body'].read()

    return {
        'statusCode': 200,
        'body': result
    }
```
            """
        )
    with st.expander("(3) Streamlit app.py, running on Amazon EC2 t2.micro instance"):
        st.markdown(
            """
```python
import streamlit as st

# Set the page layout to 'wide'
st.set_page_config(layout="wide")

import requests
from PIL import Image
from io import BytesIO
import base64
import time


# helper decoder
def decode_base64_image(image_string):
    base64_image = base64.b64decode(image_string)
    buffer = BytesIO(base64_image)
    return Image.open(buffer)


# resize a PIL image for display
def display_image(image=None, width=500, height=500):
    img = image.resize((width, height))
    return img

# API Gateway endpoint URL
api_url = 'INSERT_YOUR_API_GATEWAY_ENDPOINT_URL_HERE'

# Create two columns for layout
left_column, right_column = st.columns(2)

# ===========
with left_column:
    # Define Streamlit UI elements
    st.title('Stable Diffusion XL Image Generation with AWS Inferentia')

    prompt_one = st.text_area("Enter your prompt:",
                              f"Raccoon astronaut in space, sci-fi, future, cold color palette, muted colors, detailed, 8k")

    # Number of inference steps
    num_inference_steps_one = st.slider("Number of Inference Steps",
                                        min_value=1,
                                        max_value=100,
                                        value=30,
                                        help="More steps might improve quality, with diminishing marginal returns. 30-50 seems best, but your mileage may vary.")

    # Create an expandable section for optional parameters
    with st.expander("Optional Parameters"):
        # Random seed input
        seed_one = st.number_input("Random seed",
                                   value=555,
                                   help="Set to the same value to generate the same image if other inputs are the same; change it to generate a different image for the same inputs.")
        # Negative prompt input
        negative_prompt_one = st.text_area("Enter your negative prompt:",
                                           "cartoon, graphic, text, painting, crayon, graphite, abstract glitch, blurry")

if st.button('Generate Image'):
    with st.spinner(f'Generating Image with {num_inference_steps_one} iterations'):
        with right_column:
            start_time = time.time()
            # ===============
            # Example input data
            prompt_input_one = {
                "prompt": prompt_one,
                "parameters": {
                    "num_inference_steps": num_inference_steps_one,
                    "seed": seed_one,
                    "negative_prompt": negative_prompt_one
                }
            }

            # Make API request
            response_one = requests.post(api_url, json=prompt_input_one)

            # Process and display the response
            if response_one.status_code == 200:
                result_one = response_one.json()
                # st.success(f"Prediction result: {result}")
                image_one = display_image(decode_base64_image(result_one["generated_images"][0]))
                st.image(image_one,
                         caption=f"{prompt_one}")
                end_time = time.time()
                total_time = round(end_time - start_time, 2)
                st.text(f"Prompt: {prompt_one}")
                st.text(f"Number of Iterations: {num_inference_steps_one}")
                st.text(f"Random Seed: {seed_one}")
                st.text(f'Total time taken: {total_time} seconds')
                # Calculate and display the average time per iteration in seconds
                time_per_iteration = total_time / num_inference_steps_one
                st.text(f'Time per iteration: {time_per_iteration:.2f} seconds')
            else:
                st.error(f"Error: {response_one.text}")
```
            """
        )