Update app.py
Browse files
app.py
CHANGED
|
@@ -1,7 +1,6 @@
|
|
| 1 |
import gradio as gr
|
| 2 |
import torch
|
| 3 |
import torch.nn as nn
|
| 4 |
-
import torch.nn.functional as F
|
| 5 |
import clip
|
| 6 |
import pandas as pd
|
| 7 |
import hashlib
|
|
@@ -10,15 +9,12 @@ import cv2
|
|
| 10 |
import time
|
| 11 |
from PIL import Image
|
| 12 |
|
| 13 |
-
#
|
| 14 |
class MLP(nn.Module):
|
| 15 |
-
def __init__(self, input_size
|
| 16 |
super().__init__()
|
| 17 |
-
self.input_size = input_size
|
| 18 |
-
self.xcol = xcol
|
| 19 |
-
self.ycol = ycol
|
| 20 |
self.layers = nn.Sequential(
|
| 21 |
-
nn.Linear(
|
| 22 |
nn.Dropout(0.2),
|
| 23 |
nn.Linear(1024, 128),
|
| 24 |
nn.Dropout(0.2),
|
|
@@ -31,12 +27,14 @@ class MLP(nn.Module):
|
|
| 31 |
def forward(self, x):
|
| 32 |
return self.layers(x)
|
| 33 |
|
| 34 |
-
|
|
|
|
| 35 |
bit_string = ''.join(str(b) for b in 1 * arr.flatten())
|
| 36 |
width = int(np.ceil(len(bit_string) / 4))
|
| 37 |
return '{:0>{width}x}'.format(int(bit_string, 2), width=width)
|
| 38 |
|
| 39 |
-
|
|
|
|
| 40 |
if hash_size < 2:
|
| 41 |
raise ValueError('Hash size must be greater than or equal to 2')
|
| 42 |
|
|
@@ -48,8 +46,9 @@ def phashstr(image, hash_size=8, highfreq_factor=4):
|
|
| 48 |
dctlowfreq = dct[:hash_size, :hash_size]
|
| 49 |
med = np.median(dctlowfreq)
|
| 50 |
diff = dctlowfreq > med
|
| 51 |
-
return
|
| 52 |
|
|
|
|
| 53 |
def convert_numpy_types(data):
|
| 54 |
if isinstance(data, dict):
|
| 55 |
return {key: convert_numpy_types(value) for key, value in data.items()}
|
|
@@ -62,19 +61,13 @@ def convert_numpy_types(data):
|
|
| 62 |
else:
|
| 63 |
return data
|
| 64 |
|
| 65 |
-
|
| 66 |
-
|
| 67 |
-
|
| 68 |
-
l2 = np.atleast_1d(np.linalg.norm(a, order, axis))
|
| 69 |
-
l2[l2 == 0] = 1
|
| 70 |
-
return a / np.expand_dims(l2, axis)
|
| 71 |
-
|
| 72 |
-
def normalized(a, axis=-1, order=2):
|
| 73 |
l2 = torch.linalg.norm(a, dim=axis, ord=order, keepdim=True)
|
| 74 |
l2[l2 == 0] = 1
|
| 75 |
return a / l2
|
| 76 |
|
| 77 |
-
|
| 78 |
model = MLP(768) # CLIP embedding dim is 768 for CLIP ViT L 14
|
| 79 |
pthpath = "https://huggingface.co/haor/aesthetics/resolve/main/sac%2Blogos%2Bava1-l14-linearMSE.pth"
|
| 80 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
|
@@ -82,41 +75,47 @@ model.load_state_dict(torch.hub.load_state_dict_from_url(pthpath, map_location=d
|
|
| 82 |
model.to(device).eval()
|
| 83 |
model2, preprocess = clip.load("ViT-L/14", device=device)
|
| 84 |
|
|
|
|
| 85 |
def predict(image):
|
| 86 |
-
|
| 87 |
image = Image.fromarray(image)
|
| 88 |
image_np = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2GRAY)
|
| 89 |
laplacian_variance = cv2.Laplacian(image_np, cv2.CV_64F).var()
|
| 90 |
-
|
| 91 |
md5 = hashlib.md5(image.tobytes()).hexdigest()
|
| 92 |
sha1 = hashlib.sha1(image.tobytes()).hexdigest()
|
| 93 |
inputs = preprocess(image).unsqueeze(0).to(device)
|
| 94 |
|
| 95 |
with torch.no_grad():
|
|
|
|
| 96 |
start_time = time.time()
|
| 97 |
img_emb = model2.encode_image(inputs)
|
| 98 |
end_time = time.time()
|
| 99 |
print(f"Encoding image took {end_time - start_time} seconds")
|
| 100 |
|
|
|
|
| 101 |
start_time = time.time()
|
| 102 |
-
img_emb =
|
| 103 |
end_time = time.time()
|
| 104 |
print(f"Normalizing image took {end_time - start_time} seconds")
|
| 105 |
|
|
|
|
| 106 |
start_time = time.time()
|
| 107 |
prediction = model(img_emb).item()
|
| 108 |
end_time = time.time()
|
| 109 |
print(f"Making prediction took {end_time - start_time} seconds")
|
| 110 |
|
|
|
|
| 111 |
result = {
|
| 112 |
"clip_aesthetic": prediction,
|
| 113 |
-
"phash":
|
| 114 |
"md5": md5,
|
| 115 |
"sha1": sha1,
|
| 116 |
"laplacian_variance": laplacian_variance
|
| 117 |
}
|
| 118 |
return convert_numpy_types(result)
|
| 119 |
|
|
|
|
| 120 |
title = "CLIP Aesthetic Score"
|
| 121 |
description = "Upload an image to predict its aesthetic score using the CLIP model and calculate other image metrics."
|
| 122 |
|
|
|
|
| 1 |
import gradio as gr
|
| 2 |
import torch
|
| 3 |
import torch.nn as nn
|
|
|
|
| 4 |
import clip
|
| 5 |
import pandas as pd
|
| 6 |
import hashlib
|
|
|
|
| 9 |
import time
|
| 10 |
from PIL import Image
|
| 11 |
|
| 12 |
+
# MLP model definition
|
| 13 |
class MLP(nn.Module):
|
| 14 |
+
def __init__(self, input_size):
|
| 15 |
super().__init__()
|
|
|
|
|
|
|
|
|
|
| 16 |
self.layers = nn.Sequential(
|
| 17 |
+
nn.Linear(input_size, 1024),
|
| 18 |
nn.Dropout(0.2),
|
| 19 |
nn.Linear(1024, 128),
|
| 20 |
nn.Dropout(0.2),
|
|
|
|
| 27 |
def forward(self, x):
|
| 28 |
return self.layers(x)
|
| 29 |
|
| 30 |
+
# Convert binary array to hexadecimal string
|
| 31 |
+
def binary_array_to_hex(arr):
    """Encode a boolean / 0-1 array as a zero-padded hexadecimal string.

    The input is flattened in row-major order and each element contributes
    one bit, most significant first. The result is rendered with
    ``ceil(n_bits / 4)`` hex digits so that leading zero bits are preserved.

    Args:
        arr: Array-like of booleans or 0/1 values (any shape).

    Returns:
        Lowercase hexadecimal string; ``''`` when the input is empty.
    """
    bits = np.asarray(arr).astype(bool).ravel()
    if bits.size == 0:
        # int('', 2) raises ValueError; an empty array simply has no digits.
        return ''
    # Build the bit string from truthiness so float or wide-int dtypes cannot
    # leak text such as '1.0' into the binary literal.
    bit_string = ''.join('1' if bit else '0' for bit in bits)
    width = -(-bits.size // 4)  # integer ceiling division
    return '{:0>{width}x}'.format(int(bit_string, 2), width=width)
|
| 35 |
|
| 36 |
+
# Calculate perceptual hash of an image
|
| 37 |
+
def phash(image, hash_size=8, highfreq_factor=4):
|
| 38 |
if hash_size < 2:
|
| 39 |
raise ValueError('Hash size must be greater than or equal to 2')
|
| 40 |
|
|
|
|
| 46 |
dctlowfreq = dct[:hash_size, :hash_size]
|
| 47 |
med = np.median(dctlowfreq)
|
| 48 |
diff = dctlowfreq > med
|
| 49 |
+
return binary_array_to_hex(diff)
|
| 50 |
|
| 51 |
+
# Convert NumPy types to Python built-in types
|
| 52 |
def convert_numpy_types(data):
|
| 53 |
if isinstance(data, dict):
|
| 54 |
return {key: convert_numpy_types(value) for key, value in data.items()}
|
|
|
|
| 61 |
else:
|
| 62 |
return data
|
| 63 |
|
| 64 |
+
# Normalize tensor
|
| 65 |
+
def normalize(a, axis=-1, order=2):
    """Scale a tensor to unit norm along ``axis``.

    Vectors whose norm is exactly zero are divided by 1 instead, so they are
    returned unchanged rather than producing NaN/inf.

    Args:
        a: Input ``torch.Tensor``.
        axis: Dimension along which the norm is computed.
        order: Norm order forwarded to ``torch.linalg.norm`` as ``ord``.

    Returns:
        Tensor of the same shape as ``a``, divided by its norm along ``axis``.
    """
    norm = torch.linalg.norm(a, dim=axis, ord=order, keepdim=True)
    # Substitute out-of-place: writing into ``norm`` in place invalidates the
    # value autograd saved for the norm's backward pass.
    safe_norm = torch.where(norm == 0, torch.ones_like(norm), norm)
    return a / safe_norm
|
| 69 |
|
| 70 |
+
# Load pre-trained MLP model and CLIP model
|
| 71 |
model = MLP(768) # CLIP embedding dim is 768 for CLIP ViT L 14
|
| 72 |
pthpath = "https://huggingface.co/haor/aesthetics/resolve/main/sac%2Blogos%2Bava1-l14-linearMSE.pth"
|
| 73 |
device = "cuda" if torch.cuda.is_available() else "cpu"
|
|
|
|
| 75 |
model.to(device).eval()
|
| 76 |
model2, preprocess = clip.load("ViT-L/14", device=device)
|
| 77 |
|
| 78 |
+
# Predict aesthetic score and other metrics of an image
|
| 79 |
def predict(image):
    """Score an image with the CLIP aesthetic MLP and compute basic metrics.

    Args:
        image: Image as a NumPy array accepted by ``Image.fromarray``
            (presumably an RGB array from the Gradio input; confirm against
            the interface definition).

    Returns:
        A dict with the keys ``clip_aesthetic``, ``phash``, ``md5``, ``sha1``
        and ``laplacian_variance``, passed through ``convert_numpy_types``.

    Raises:
        ValueError: If ``image`` is ``None``.
    """
    if image is None:
        # Fail with a clear message instead of an opaque error from PIL.
        raise ValueError("No image provided")

    # Preprocess image
    image = Image.fromarray(image)
    image_np = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2GRAY)
    laplacian_variance = cv2.Laplacian(image_np, cv2.CV_64F).var()
    phash_value = phash(image)

    # Serialise the pixel data once and reuse it for both digests.
    raw_bytes = image.tobytes()
    md5 = hashlib.md5(raw_bytes).hexdigest()
    sha1 = hashlib.sha1(raw_bytes).hexdigest()

    inputs = preprocess(image).unsqueeze(0).to(device)

    with torch.no_grad():
        # Extract image features using CLIP model.
        # perf_counter is monotonic, so intervals cannot go negative when the
        # system clock is adjusted.
        start_time = time.perf_counter()
        img_emb = model2.encode_image(inputs)
        end_time = time.perf_counter()
        print(f"Encoding image took {end_time - start_time} seconds")

        # Normalize image features
        start_time = time.perf_counter()
        img_emb = normalize(img_emb).float()
        end_time = time.perf_counter()
        print(f"Normalizing image took {end_time - start_time} seconds")

        # Predict aesthetic score using MLP model
        start_time = time.perf_counter()
        prediction = model(img_emb).item()
        end_time = time.perf_counter()
        print(f"Making prediction took {end_time - start_time} seconds")

    # Return prediction results
    result = {
        "clip_aesthetic": prediction,
        "phash": phash_value,
        "md5": md5,
        "sha1": sha1,
        "laplacian_variance": laplacian_variance,
    }
    return convert_numpy_types(result)
|
| 117 |
|
| 118 |
+
# Create web interface using Gradio
|
| 119 |
title = "CLIP Aesthetic Score"
|
| 120 |
description = "Upload an image to predict its aesthetic score using the CLIP model and calculate other image metrics."
|
| 121 |
|