
CLIP Model


Creating a custom inference script to run CLIP on SageMaker.

# clip/code/inference.py

import base64
from io import BytesIO
from PIL import Image
from transformers import CLIPProcessor, CLIPModel


def model_fn(model_dir):
    # Load the CLIP model and processor from model_dir
    model = CLIPModel.from_pretrained(model_dir)
    processor = CLIPProcessor.from_pretrained(model_dir)
    return model, processor

def predict_fn(data, model_and_processor):
    # unpack the model and processor returned by model_fn
    model, processor = model_and_processor
    b64 = data['inputs']['image']
    candidates = data['inputs']['candidates']
    image = Image.open(BytesIO(base64.b64decode(b64)))
    inputs = processor(text=candidates, 
                       images=image, 
                       return_tensors="pt", 
                       padding=True)
    outputs = model(**inputs)
    # this is the image-text similarity score
    logits_per_image = outputs.logits_per_image 
    # we can take the softmax to get the label probabilities
    probs = logits_per_image.softmax(dim=1) 
    # return a plain dictionary with a list so the result is JSON serializable
    return {"probs": probs.tolist()}