You signed in with another tab or window. Reload to refresh your session.You signed out in another tab or window. Reload to refresh your session.You switched accounts on another tab or window. Reload to refresh your session.Dismiss alert
import requests
import torch
from PIL import Image
from transformers import AutoProcessor, AutoModelForVision2Seq
from io import BytesIO
images = ["https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg",
"https://cdn.britannica.com/59/94459-050-DBA42467/Skyline-Chicago.jpg",
"https://cdn.britannica.com/68/170868-050-8DDE8263/Golden-Gate-Bridge-San-Francisco.jpg",
]
images = [Image.open(BytesIO(requests.get(image).content)) for image in images]
# Note that passing the image urls (instead of the actual pil images) to the processor is also possible
processor = AutoProcessor.from_pretrained("HuggingFaceM4/idefics2-8b")
model = AutoModelForVision2Seq.from_pretrained(
"HuggingFaceM4/idefics2-8b",
torch_dtype=torch.bfloat16,
_attn_implementation="flash_attention_2",
device_map='auto',
)
# Create inputs
messages = [
{
"role": "user",
"content": [
{"type": "image"},
{"type": "text", "text": "What do we see in this image?"},
]
},
{
"role": "assistant",
"content": [
{"type": "text", "text": "In this image, we can see the city of New York, and more specifically the Statue of Liberty."},
]
},
{
"role": "user",
"content": [
{"type": "image"},
{"type": "text", "text": "And how about this image?"},
]
},
]
prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
inputs = processor(text=prompt, images=[images[0], images[1]], return_tensors="pt")
with torch.device('cuda'):
inputs = {k: v.to('cuda') for k, v in inputs.items()}
# Generate
generated_ids = model.generate(**inputs, max_new_tokens=500)
generated_texts = processor.batch_decode(generated_ids, skip_special_tokens=True)
print(generated_texts)
gives:
['User: What do we see in this image? \nAssistant: In this image, we can see the city of New York, and more specifically the Statue of Liberty. \nUser: And how about this image? \nAssistant: In this image we can see buildings, trees, lights, water and sky.']
The text was updated successfully, but these errors were encountered:
requires dev transformers 4.40.0
They said should be in TGI soon.
gives:
The text was updated successfully, but these errors were encountered: